#!/usr/bin/env python3
"""
RND1 Diffusion Model Demo for Hugging Face Spaces with ZeroGPU
"""
import torch
import gradio as gr
import spaces
import random
import numpy as np
from transformers import AutoTokenizer

# Global model and tokenizer
model = None
tokenizer = None
device = "cuda"  # ZeroGPU attaches the CUDA device inside @spaces.GPU-decorated calls


def set_seed(seed: int):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def load_model():
    """Load model and tokenizer (called once at startup)."""
    global model, tokenizer
    from rnd.configuration_rnd import RND1Config
    from rnd.modeling_rnd import RND1LM

    model_path = "radicalnumerics/RND1-Base-0910"
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print("Loading model...")
    cfg = RND1Config.from_pretrained(model_path)
    cfg.model_type = "rnd1"
    cfg.attn_implementation = "sdpa"  # scaled-dot-product attention backend
    cfg.moe_backend = "hf"
    model = RND1LM.from_pretrained(
        model_path,
        config=cfg,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    model.eval()
    print("Model loaded successfully!")


@spaces.GPU(duration=120)  # Request GPU for up to 120 seconds
def generate_text(
    prompt: str,
    mode: str,
    num_steps: int,
    max_new_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
    seed: int,
    progress=gr.Progress(),
):
    """
    Generate text using the RND1 diffusion model.

    Args:
        prompt: Input text prompt
        mode: Generation mode ("task" or "completion")
        num_steps: Number of diffusion steps
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        top_k: Top-k filtering (0 to disable)
        top_p: Top-p nucleus filtering (0 to disable)
        seed: Random seed
        progress: Gradio progress tracker
    """
    if not prompt.strip():
        return "⚠️ Please enter a prompt."
    progress(0, desc="Setting seed...")
    set_seed(seed)

    progress(0.1, desc="Preparing prompt...")
    # Format the prompt based on mode
    if mode == "task":
        if not prompt.strip().startswith("Question:"):
            formatted_prompt = f"Question: {prompt}\n"
        else:
            formatted_prompt = prompt
    else:
        formatted_prompt = prompt

    # Tokenize
    progress(0.2, desc="Tokenizing...")
    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    # Prepared for completeness; note it is not passed to generate() below.
    attention_mask = inputs.attention_mask.to(device) if "attention_mask" in inputs else None

    # Prepare generation config
    from rnd.generation_config import RND1GenerationConfig

    greedy = (temperature == 1.0)  # the Temperature slider documents 1.0 as greedy/deterministic
    gen_config = RND1GenerationConfig(
        max_new_tokens=max_new_tokens,
        num_diffusion_steps=num_steps,
        mask_token_id=151669,  # RND1's mask token id (model-specific)
        temperature=temperature if not greedy else 1.0,
        top_k=top_k if top_k > 0 else None,
        top_p=top_p if top_p > 0 else None,
        greedy=greedy,
        eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645,  # fallback if the tokenizer defines no EOS
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
    )
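    # RND1 generates by iterative denoising rather than left-to-right decoding:
    # masked target positions are progressively refined over num_diffusion_steps
    # passes, which is why steps and max_new_tokens are configured separately.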

    # Generate
    progress(0.3, desc=f"Generating ({num_steps} diffusion steps)...")
    generator = torch.Generator(device=device)
    generator.manual_seed(seed)
    with torch.no_grad():
        output = model.generate(
            inputs=input_ids,
            generation_config=gen_config,
            generator=generator,
        )

    progress(0.9, desc="Decoding...")
    # Decode only the newly generated tokens (everything after the prompt)
    generated_tokens = output[0][len(input_ids[0]):]
    generation = tokenizer.decode(
        generated_tokens.tolist(),
        skip_special_tokens=True,
    )
    progress(1.0, desc="Complete!")
    return generation
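
# A direct call (hypothetical, bypassing the UI; in the app Gradio supplies `progress`):
#   result = generate_text("Explain recursion.", "task", 64, 128, 1.0, 0, 0.0, 42)
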
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="RND1 Diffusion Language Model", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🌊 RND1 Diffusion Language Model
        Generate text using a diffusion-based language model. The model uses iterative denoising
        to progressively refine masked tokens into coherent text.

        **Note:** First generation may take longer as the model loads.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here...",
                    lines=4,
                    value="Write a Python function that finds the longest common subsequence of two strings.",
                )
                mode = gr.Radio(
                    choices=["task", "completion"],
                    value="task",
                    label="Generation Mode",
                    info="Task: Q&A format for instructions | Completion: Continue the text",
                )
                with gr.Accordion("Generation Settings", open=True):
                    num_steps = gr.Slider(
                        minimum=16,
                        maximum=512,
                        value=256,
                        step=16,
                        label="Diffusion Steps",
                        info="More steps = better quality but slower",
                    )
                    max_new_tokens = gr.Slider(
                        minimum=32,
                        maximum=512,
                        value=256,
                        step=32,
                        label="Max New Tokens",
                    )
                with gr.Accordion("Sampling Parameters", open=False):
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Temperature",
                        info="1.0 = greedy/deterministic",
                    )
                    top_k = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1,
                        label="Top-K",
                        info="0 to disable",
                    )
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.05,
                        label="Top-P (Nucleus)",
                        info="0 to disable",
                    )
                    seed = gr.Slider(
                        minimum=0,
                        maximum=100000,
                        value=12345,
                        step=1,
                        label="Random Seed",
                    )
                generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
            with gr.Column(scale=1):
                output = gr.Textbox(
                    label="Generated Text",
                    lines=20,
                    show_copy_button=True,
                )
        gr.Markdown("""
        ### Examples
        Try these prompts to see what the model can do!
        """)
        gr.Examples(
            examples=[
                ["Write a Python function that finds the longest common subsequence of two strings.", "task", 256, 256, 1.0, 0, 0.0, 12345],
                ["Explain the concept of recursion with a simple example.", "task", 256, 200, 1.0, 0, 0.0, 42],
                ["The key to understanding quantum computing lies in", "completion", 256, 256, 1.0, 0, 0.0, 9876],
                ["Once upon a time in a distant galaxy,", "completion", 256, 300, 1.0, 0, 0.0, 7777],
            ],
            inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed],
            outputs=output,
            fn=generate_text,
            cache_examples=False,
        )
        generate_btn.click(
            fn=generate_text,
            inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed],
            outputs=output,
        )
    return demo


if __name__ == "__main__":
    # Load model at startup
    load_model()

    # Launch Gradio interface
    demo = create_interface()
    demo.queue(max_size=10)  # Enable queue for ZeroGPU
    demo.launch()