import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL, DDPMScheduler
from transformers import CLIPTextModel, CLIPImageProcessor, AutoTokenizer

# Load the fine-tuned models
vae = AutoencoderKL.from_pretrained("./Model/finetuned_vae_v1_150_epoch_9")
unet = UNet2DConditionModel.from_pretrained("./Model/finetuned_crosswalk_model_v1_150_epoch_9")
scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

# Load the CLIP text encoder, tokenizer, and feature extractor
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Assemble the fine-tuned Stable Diffusion pipeline
pipeline = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    feature_extractor=feature_extractor,
    safety_checker=None,
    requires_safety_checker=False,  # suppress the warning raised when safety_checker is None
)

# Move the pipeline to GPU (if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Working with: {device}")
pipeline.to(device)

# Generate an image from a text prompt
prompt = "a crosswalk image"  # customize your prompt here
with torch.amp.autocast(device):  # use the detected device so autocast also works on CPU
    image = pipeline(prompt, num_inference_steps=50, guidance_scale=9).images[0]

# Save and show the generated image
image.resize((640, 360)).save("output.png")
image.resize((640, 360)).show()
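
# --- Optional: reproducible, batched generation ------------------------------
# A minimal sketch (not part of the original script) showing two common
# follow-ups with the standard diffusers API: seeding the sampler with a
# torch.Generator so runs are repeatable, and requesting several images per
# prompt in a single call. It reuses the `pipeline`, `prompt`, and `device`
# defined above; the seed 42 and the batch size of 4 are arbitrary
# illustrative choices.
generator = torch.Generator(device=device).manual_seed(42)  # fixed seed => identical outputs per run
images = pipeline(
    prompt,
    num_inference_steps=50,
    guidance_scale=9,
    num_images_per_prompt=4,  # generate 4 variations in one forward pass
    generator=generator,
).images
for i, img in enumerate(images):
    img.resize((640, 360)).save(f"output_{i}.png")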