sausheong committed
Commit 744de0d · Parent: b6d1ff1

cleaned up

Files changed (4)
  1. README.md +53 -39
  2. __pycache__/ssllm_hf.cpython-310.pyc +0 -0
  3. generate.py +3 -14
  4. ssllm_hf.py +2 -192
README.md CHANGED
@@ -80,52 +80,66 @@ SSLLM is a 218M parameter decoder-only transformer language model created for te
  from ssllm_hf import SSLLMForCausalLM, SSLLMConfig
  import tiktoken
  import torch
-
- # Load model and config
- config = SSLLMConfig.from_pretrained('ssllm_hf')
- model = SSLLMForCausalLM.from_pretrained('ssllm_hf')
-
- # Load tokenizer
- tokenizer = tiktoken.get_encoding('cl100k_base')
-
- # Generate text
- prompt = "The future of artificial intelligence will"
- input_ids = torch.tensor([tokenizer.encode(prompt)])
-
- with torch.no_grad():
-     outputs = model.generate(
-         input_ids,
-         max_new_tokens=128,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9,
-         repetition_penalty=1.2,
-         no_repeat_ngram_size=4,
-         pad_token_id=100257,
-         eos_token_id=100257,
-     )
-
- generated_text = tokenizer.decode(outputs[0].tolist())
- print(generated_text)
- ```
-
- ## Performance
-
- ### Generation Quality
-
- The model produces coherent, contextually relevant text with:
- - Logical narrative flow
- - Reduced repetition through penalty mechanisms
- - Diverse outputs across temperature settings
- - ✅ Contextual relevance to input prompts
- - Creative storytelling capabilities
-
- ### Example Outputs
+ from safetensors.torch import load_file
+ from huggingface_hub import hf_hub_download
+
+ # Initialize model with config
+ config = SSLLMConfig.from_pretrained('sausheong/ssllm_hf')
+ model = SSLLMForCausalLM(config)
+
+ # Download and load model weights
+ model_path = hf_hub_download(repo_id='sausheong/ssllm_hf', filename='model.safetensors')
+ state_dict = load_file(model_path)
+ model.load_state_dict(state_dict, strict=False)
+
+ # Setup device and eval mode
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model = model.to(device).eval()
+
+ # Initialize tokenizer
+ tokenizer = tiktoken.get_encoding('cl100k_base')
+
+ def generate_text(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9, top_k=40):
+     # Encode the prompt
+     input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
+     attention_mask = torch.ones_like(input_ids)
+
+     # Generate with the model
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             pad_token_id=100257,
+             eos_token_id=100257,
+         )
+
+     # Decode only the new tokens
+     new_tokens = outputs[0][input_ids.shape[1]:].tolist()
+     generated = tokenizer.decode(new_tokens)
+
+     print(f"\n{prompt}{generated}")
+     print(f"\nTokens generated: {len(new_tokens)}")
+
+ if __name__ == "__main__":
+     prompt = "In a small village nestled between mountains,"
+
+     # Test different generation settings
+     print(f"PROMPT: {prompt}")
+     generate_text(prompt)
+ ```
+
+ ## Example Outputs
 
  **Prompt:** "In a small village nestled between mountains,"
 
- **Output:** "In a small village nestled between mountains, lived two best friends - Benny the Bunny and Daisy the Deer. They loved exploring their surroundings together! One sunny day, they stumbled upon an old book titled 'The Book of Life.' Excitedly, they decided to read it aloud..."
+ **Output:** "In a small village nestled between mountains, lived two curious friends named Sam and Alex. They were always curious and loved learning new things. One day, while exploring the woods near the riverbank, they stumbled upon a mysterious object. It was a tiny, glowing object with a glowing light.
+
+ Sam explained that it had a special kind of light that could change how the light behaves. He told them that the light was made up of different colors and patterns, making it an even better way to see clearly. This made Sam and Alex curious."
 
  ## Limitations
@@ -135,7 +149,7 @@ The model produces coherent, contextually relevant text with:
  - **Tokenizer:** Requires tiktoken library (not standard HuggingFace tokenizer)
  - **Special Tokens:** Limited special token vocabulary
 
- ## Ethical Considerations
+ ## Considerations
 
  - Model outputs should be reviewed for potential biases
  - Not suitable for generating harmful or inappropriate content
@@ -155,7 +169,7 @@ The model produces coherent, contextually relevant text with:
  - **Framework:** PyTorch
  - **HuggingFace Transformers:** Compatible with generation utilities
- - **vLLM:** Requires GPT-2 format conversion (see conversion scripts)
+ - **vLLM:** No (Requires GPT-2 format conversion)
  - **ONNX:** Not currently supported
  - **TensorFlow:** Not supported
 
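Note: the new README snippet loads weights with `load_state_dict(state_dict, strict=False)`, which silently ignores mismatched keys. A minimal sanity check, sketched here for reference and mirroring the verification the removed conversion script performed (not part of the committed README; assumes `model` and `state_dict` from the snippet above):

```python
# Sketch: surface any key mismatches that strict=False would otherwise hide.
missing, unexpected = model.load_state_dict(state_dict, strict=False)
if missing:
    print(f"Missing keys: {missing}")        # parameters left at random init
if unexpected:
    print(f"Unexpected keys: {unexpected}")  # checkpoint entries that went unused
```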
__pycache__/ssllm_hf.cpython-310.pyc ADDED
Binary file (6.25 kB).
 
generate.py CHANGED
@@ -21,10 +21,6 @@ model = model.to(device).eval()
  tokenizer = tiktoken.get_encoding('cl100k_base')
 
  def generate_text(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9, top_k=40):
-     print(f"\n{'='*60}")
-     print(f"Temperature: {temperature}, Top-p: {top_p}, Top-k: {top_k}")
-     print(f"{'='*60}")
-
      # Encode the prompt
      input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
      attention_mask = torch.ones_like(input_ids)
@@ -45,21 +41,14 @@ def generate_text(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9, top_k=40):
 
      # Decode only the new tokens
      new_tokens = outputs[0][input_ids.shape[1]:].tolist()
-     try:
-         # Try normal decoding first
-         generated = tokenizer.decode(new_tokens)
-     except:
-         # Fallback to per-token decoding if there are any invalid tokens
-         generated = ''.join(tokenizer.decode([t]) for t in new_tokens if t < tokenizer.n_vocab)
+     generated = tokenizer.decode(new_tokens)
 
      print(f"\n{prompt}{generated}")
      print(f"\nTokens generated: {len(new_tokens)}")
 
  if __name__ == "__main__":
-     prompt = "The future of artificial intelligence will"
+     prompt = "In a small village nestled between mountains,"
 
      # Test different generation settings
      print(f"PROMPT: {prompt}")
-     generate_text(prompt, temperature=0.9, top_p=0.95, top_k=50) # Creative
-     generate_text(prompt, temperature=0.7, top_p=0.9, top_k=40) # Balanced
-     generate_text(prompt, temperature=0.3, top_p=0.8, top_k=20) # Focused
+     generate_text(prompt)
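Note: with the try/except fallback gone, `tokenizer.decode` will now raise if sampling ever emits an ID that cl100k_base leaves unassigned (the config's vocab_size of 100277 includes such gaps between special tokens). For reference, a defensive decode in the spirit of the removed code, with `except Exception` substituted for the original bare `except`:

```python
# Sketch of the removed fallback: decode the whole sequence if possible,
# otherwise decode token by token, skipping IDs outside the vocabulary.
try:
    generated = tokenizer.decode(new_tokens)
except Exception:
    generated = ''.join(
        tokenizer.decode([t]) for t in new_tokens if t < tokenizer.n_vocab
    )
```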
 
 
ssllm_hf.py CHANGED
@@ -1,16 +1,11 @@
- #!/usr/bin/env python3
  """
- Fixed conversion script that preserves all parameters and uses a custom model class
- that exactly matches SSLLM architecture but is compatible with HuggingFace.
+ A custom model for causal language modeling, compatible with HuggingFace.
  """
 
- import os
- import json
  import torch
  import torch.nn as nn
  from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
  from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
- import tiktoken
 
  class SSLLMConfig(PretrainedConfig):
      """Configuration class for SSLLM model compatible with HuggingFace."""
@@ -203,189 +198,4 @@ class SSLLMForCausalLM(PreTrainedModel, GenerationMixin):
 
      def set_input_embeddings(self, new_embeddings):
          """Set input embeddings."""
-         self.token_embed = new_embeddings
-
- def load_ssllm_checkpoint(checkpoint_path):
-     """Load the SSLLM checkpoint and extract model state and config."""
-     print(f"Loading SSLLM checkpoint from: {checkpoint_path}")
-     checkpoint = torch.load(checkpoint_path, map_location='cpu')
-
-     if 'model_state_dict' in checkpoint:
-         state_dict = checkpoint['model_state_dict']
-         config = checkpoint.get('config', {})
-     else:
-         state_dict = checkpoint
-         config = {}
-
-     print(f"Loaded checkpoint with {len(state_dict)} parameters")
-
-     # Print parameter shapes for verification
-     total_params = sum(p.numel() for p in state_dict.values())
-     print(f"Total parameters in checkpoint: {total_params:,}")
-
-     return state_dict, config
-
- def convert_ssllm_to_hf(ssllm_state_dict, ssllm_config):
-     """Convert SSLLM state dict to HuggingFace format with exact parameter preservation."""
-
-     # Extract configuration
-     vocab_size = ssllm_config.get('vocab_size', 100277)
-     d_model = ssllm_config.get('d_model', 768)
-     num_heads = ssllm_config.get('num_heads', 12)
-     num_layers = ssllm_config.get('num_layers', 10)
-     d_ff = ssllm_config.get('d_ff', 2560)
-     max_seq_len = ssllm_config.get('max_seq_len', 1024)
-
-     print(f"Model config: vocab_size={vocab_size}, d_model={d_model}, num_heads={num_heads}")
-     print(f"              num_layers={num_layers}, d_ff={d_ff}, max_seq_len={max_seq_len}")
-
-     # Create SSLLM configuration
-     config = SSLLMConfig(
-         vocab_size=vocab_size,
-         d_model=d_model,
-         num_heads=num_heads,
-         num_layers=num_layers,
-         d_ff=d_ff,
-         max_seq_len=max_seq_len,
-         dropout_rate=0.1,
-         attention_dropout=0.1,
-         stochastic_depth_rate=0.1,
-         bos_token_id=100256,
-         eos_token_id=100257,
-         pad_token_id=100257,
-     )
-
-     # Create SSLLM model
-     model = SSLLMForCausalLM(config)
-
-     print(f"Created SSLLM model with {sum(p.numel() for p in model.parameters())} parameters")
-
-     # Load state dict directly (should be exact match)
-     missing_keys, unexpected_keys = model.load_state_dict(ssllm_state_dict, strict=False)
-
-     if missing_keys:
-         print(f"Missing keys: {missing_keys}")
-     if unexpected_keys:
-         print(f"Unexpected keys: {unexpected_keys}")
-
-     # Verify parameter count
-     model_params = sum(p.numel() for p in model.parameters())
-     checkpoint_params = sum(p.numel() for p in ssllm_state_dict.values())
-
-     print(f"Parameter verification:")
-     print(f"  Model parameters: {model_params:,}")
-     print(f"  Checkpoint parameters: {checkpoint_params:,}")
-     print(f"  Match: {'✅' if model_params == checkpoint_params else '❌'}")
-
-     return model, config
-
- def save_hf_model(model, config, output_dir):
-     """Save the converted model in HuggingFace format."""
-     os.makedirs(output_dir, exist_ok=True)
-
-     # Save model and config
-     model.save_pretrained(output_dir)
-
-     # Create tokenizer config for cl100k_base
-     tokenizer_config = {
-         "tokenizer_class": "tiktoken",
-         "model_name": "cl100k_base",
-         "vocab_size": 100277,
-         "bos_token": "",
-         "eos_token": "",
-         "pad_token": "",
-         "unk_token": "",
-         "mask_token": "",
-         "additional_special_tokens": []
-     }
-
-     with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w') as f:
-         json.dump(tokenizer_config, f, indent=2)
-
-     # Create generation config
-     generation_config = {
-         "bos_token_id": 100256,
-         "eos_token_id": 100257,
-         "pad_token_id": 100257,
-         "max_length": 1024,
-         "do_sample": True,
-         "temperature": 0.7,
-         "top_p": 0.9,
-         "repetition_penalty": 1.1,
-         "no_repeat_ngram_size": 3
-     }
-
-     with open(os.path.join(output_dir, 'generation_config.json'), 'w') as f:
-         json.dump(generation_config, f, indent=2)
-
-     # Create tokenizer info
-     with open(os.path.join(output_dir, 'tokenizer_info.txt'), 'w') as f:
-         f.write("Tokenizer: cl100k_base (tiktoken)\n")
-         f.write("Vocabulary size: 100277\n")
-         f.write("BOS token ID: 100256\n")
-         f.write("EOS token ID: 100257\n")
-         f.write("PAD token ID: 100257\n")
-
-     print(f"Model saved to: {output_dir}")
-     print("Files created:")
-     print("  - pytorch_model.bin (model weights)")
-     print("  - config.json (model configuration)")
-     print("  - tokenizer_config.json (tokenizer configuration)")
-     print("  - generation_config.json (generation parameters)")
-     print("  - tokenizer_info.txt (tokenizer metadata)")
-
- def main():
-     """Main conversion function."""
-     import argparse
-
-     parser = argparse.ArgumentParser(description='Convert SSLLM checkpoint to HuggingFace format')
-     parser.add_argument('--input', type=str, default='ssllm.pth',
-                         help='Path to SSLLM checkpoint file (default: ssllm.pth)')
-     parser.add_argument('--output', type=str, default='ssllm_hf',
-                         help='Output directory for HuggingFace model (default: ssllm_hf)')
-
-     args = parser.parse_args()
-
-     if not os.path.exists(args.input):
-         print(f"Error: Input checkpoint file '{args.input}' not found")
-         return
-
-     print("=" * 60)
-     print("SSLLM TO HUGGINGFACE CONVERSION")
-     print("=" * 60)
-
-     # Load SSLLM checkpoint
-     ssllm_state_dict, ssllm_config = load_ssllm_checkpoint(args.input)
-
-     # Convert to HuggingFace format with exact parameter preservation
-     model, config = convert_ssllm_to_hf(ssllm_state_dict, ssllm_config)
-
-     # Save in HuggingFace format
-     save_hf_model(model, config, args.output)
-
-     print("=" * 60)
-     print("CONVERSION COMPLETED SUCCESSFULLY!")
-     print("=" * 60)
-     print(f"Your model is now available at: {args.output}")
-     print("\nTo use with HuggingFace transformers:")
-     print("```python")
-     print("from transformers import AutoModel, AutoConfig")
-     print("import tiktoken")
-     print("")
-     print(f"# Load model")
-     print(f"model = AutoModel.from_pretrained('{args.output}', trust_remote_code=True)")
-     print("")
-     print("# Load tokenizer (tiktoken)")
-     print("tokenizer = tiktoken.get_encoding('cl100k_base')")
-     print("")
-     print("# Generate text")
-     print("input_text = 'Once upon a time'")
-     print("input_ids = torch.tensor([tokenizer.encode(input_text)])")
-     print("with torch.no_grad():")
-     print("    outputs = model.generate(input_ids, max_length=100, do_sample=True, temperature=0.7)")
-     print("generated_text = tokenizer.decode(outputs[0].tolist())")
-     print("print(generated_text)")
-     print("```")
-
- if __name__ == "__main__":
-     main()
+         self.token_embed = new_embeddings
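Note: with the conversion helpers removed, ssllm_hf.py now contains only the config and model classes. A minimal sketch of how the conversion could be reproduced with what remains, using the defaults the removed code relied on (the 'ssllm.pth' path and config values below are those defaults, not guaranteed to match any particular checkpoint):

```python
import torch
from ssllm_hf import SSLLMConfig, SSLLMForCausalLM

# Load the raw SSLLM checkpoint (default path from the removed main()).
checkpoint = torch.load('ssllm.pth', map_location='cpu')
state_dict = checkpoint.get('model_state_dict', checkpoint)

# Fallback values from the removed convert_ssllm_to_hf().
config = SSLLMConfig(
    vocab_size=100277, d_model=768, num_heads=12, num_layers=10,
    d_ff=2560, max_seq_len=1024,
    bos_token_id=100256, eos_token_id=100257, pad_token_id=100257,
)
model = SSLLMForCausalLM(config)
model.load_state_dict(state_dict, strict=False)

# Write config.json plus the weights in HuggingFace format.
model.save_pretrained('ssllm_hf')
```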