import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
# flash_attention_2 only runs on CUDA, so move the model to the GPU.
# (Under ZeroGPU, `spaces` intercepts this call and performs the actual
# transfer once a GPU is attached to the worker.)
model = model.eval().cuda().to(torch.bfloat16)


@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Process an image with DeepSeek-OCR.

    Args:
        image: PIL Image from the Gradio component
        model_size: Model size configuration
        task_type: OCR task type
    """
    if image is None:
        return "Please upload an image first."

    # Create temporary directory for output
    with tempfile.TemporaryDirectory() as output_path:
        # Set prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        elif task_type == "Extract Text":
            prompt = "<image>\nExtract all text from the image. "
        else:
            prompt = "<image>\nFree OCR. "

        # Save uploaded image temporarily (JPEG has no alpha channel, so force RGB)
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.convert("RGB").save(temp_image_path)

        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        # Run inference
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
        )

        return result


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Document Recognition

        Upload an image to extract text using the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown", "Extract Text"],
                value="Convert to Markdown",
                label="Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result", lines=20, show_copy_button=True
            )

    # Examples
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        fn=process_image,
        cache_examples=False,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()
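
# --- Usage sketch (hypothetical, kept commented so it never runs with the app) ---
# `process_image` can be exercised directly in a Python session, bypassing the UI.
# Assumes a local test image at "examples/receipt.jpg" (the same path used in the
# Examples section above); any image file works.
#
#   from PIL import Image
#   text = process_image(Image.open("examples/receipt.jpg"), "Base", "Free OCR")
#   print(text)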