#!/usr/bin/env python3 """ RND1 Diffusion Model Demo for Hugging Face Spaces with ZeroGPU """ import torch import gradio as gr import spaces import random import numpy as np from transformers import AutoTokenizer # Global model and tokenizer model = None tokenizer = None device = "cuda" def set_seed(seed: int): """Set random seed for reproducibility.""" random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) def load_model(): """Load model and tokenizer (called once at startup).""" global model, tokenizer from rnd.configuration_rnd import RND1Config from rnd.modeling_rnd import RND1LM model_path = "radicalnumerics/RND1-Base-0910" print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) print("Loading model...") cfg = RND1Config.from_pretrained(model_path) cfg.model_type = "rnd1" cfg.attn_implementation = "sdpa" cfg.moe_backend = "hf" model = RND1LM.from_pretrained( model_path, config=cfg, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, use_safetensors=True, low_cpu_mem_usage=True, ) model.eval() print("Model loaded successfully!") @spaces.GPU(duration=120) # Request GPU for up to 120 seconds def generate_text( prompt: str, mode: str, num_steps: int, max_new_tokens: int, temperature: float, top_k: int, top_p: float, seed: int, progress=gr.Progress() ): """ Generate text using RND1 diffusion model. Args: prompt: Input text prompt mode: Generation mode ('task' or 'completion') num_steps: Number of diffusion steps max_new_tokens: Maximum tokens to generate temperature: Sampling temperature top_k: Top-k filtering (0 to disable) top_p: Top-p nucleus filtering (0 to disable) seed: Random seed progress: Gradio progress tracker """ if not prompt.strip(): return "⚠️ Please enter a prompt." progress(0, desc="Setting seed...") set_seed(seed) progress(0.1, desc="Preparing prompt...") # Format prompt based on mode if mode == "task": if not prompt.strip().startswith("Question:"): formatted_prompt = f"Question: {prompt}\n" else: formatted_prompt = prompt else: formatted_prompt = prompt # Tokenize progress(0.2, desc="Tokenizing...") inputs = tokenizer(formatted_prompt, return_tensors="pt") input_ids = inputs.input_ids.to(device) attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None # Prepare generation config from rnd.generation_config import RND1GenerationConfig greedy = (temperature == 1.0) gen_config = RND1GenerationConfig( max_new_tokens=max_new_tokens, num_diffusion_steps=num_steps, mask_token_id=151669, temperature=temperature if not greedy else 1.0, top_k=top_k if top_k > 0 else None, top_p=top_p if top_p > 0 else None, greedy=greedy, eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645, pad_token_id=tokenizer.pad_token_id, bos_token_id=tokenizer.bos_token_id, ) # Generate progress(0.3, desc=f"Generating ({num_steps} diffusion steps)...") generator = torch.Generator(device=device) generator.manual_seed(seed) with torch.no_grad(): output = model.generate( inputs=input_ids, generation_config=gen_config, generator=generator, ) progress(0.9, desc="Decoding...") # Decode generated tokens generated_tokens = output[0][len(input_ids[0]):] generation = tokenizer.decode( generated_tokens.tolist(), skip_special_tokens=True ) progress(1.0, desc="Complete!") return generation # Create Gradio interface def create_interface(): with gr.Blocks(title="RND1 Diffusion Language Model", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🌊 RND1 Diffusion Language Model Generate text using a diffusion-based language model. The model uses iterative denoising to progressively refine masked tokens into coherent text. **Note:** First generation may take longer as the model loads. """) with gr.Row(): with gr.Column(scale=1): prompt = gr.Textbox( label="Prompt", placeholder="Enter your prompt here...", lines=4, value="Write a Python function that finds the longest common subsequence of two strings." ) mode = gr.Radio( choices=["task", "completion"], value="task", label="Generation Mode", info="Task: Q&A format for instructions | Completion: Continue the text" ) with gr.Accordion("Generation Settings", open=True): num_steps = gr.Slider( minimum=16, maximum=512, value=256, step=16, label="Diffusion Steps", info="More steps = better quality but slower" ) max_new_tokens = gr.Slider( minimum=32, maximum=512, value=256, step=32, label="Max New Tokens" ) with gr.Accordion("Sampling Parameters", open=False): temperature = gr.Slider( minimum=0.1, maximum=2.0, value=1.0, step=0.1, label="Temperature", info="1.0 = greedy/deterministic" ) top_k = gr.Slider( minimum=0, maximum=100, value=0, step=1, label="Top-K", info="0 to disable" ) top_p = gr.Slider( minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Top-P (Nucleus)", info="0 to disable" ) seed = gr.Slider( minimum=0, maximum=100000, value=12345, step=1, label="Random Seed" ) generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg") with gr.Column(scale=1): output = gr.Textbox( label="Generated Text", lines=20, show_copy_button=True ) gr.Markdown(""" ### Examples Try these prompts to see what the model can do! """) gr.Examples( examples=[ ["Write a Python function that finds the longest common subsequence of two strings.", "task", 256, 256, 1.0, 0, 0.0, 12345], ["Explain the concept of recursion with a simple example.", "task", 256, 200, 1.0, 0, 0.0, 42], ["The key to understanding quantum computing lies in", "completion", 256, 256, 1.0, 0, 0.0, 9876], ["Once upon a time in a distant galaxy,", "completion", 256, 300, 1.0, 0, 0.0, 7777], ], inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed], outputs=output, fn=generate_text, cache_examples=False, ) generate_btn.click( fn=generate_text, inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed], outputs=output, ) return demo if __name__ == "__main__": # Load model at startup load_model() # Launch Gradio interface demo = create_interface() demo.queue(max_size=10) # Enable queue for ZeroGPU demo.launch()