import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL, DDPMScheduler
from transformers import CLIPTextModel, CLIPImageProcessor, AutoTokenizer

# Load the fine-tuned models
vae = AutoencoderKL.from_pretrained("./Model/finetuned_vae_v1_150_epoch_9")
unet = UNet2DConditionModel.from_pretrained("./Model/finetuned_crosswalk_model_v1_150_epoch_9")
scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

# Load the CLIP text encoder, tokenizer, and feature extractor
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Assemble the fine-tuned Stable Diffusion pipeline
pipeline = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    feature_extractor=feature_extractor,
    safety_checker=None,
    requires_safety_checker=False,  # suppress the warning raised when safety_checker is None
)

# Move the pipeline to GPU (if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Working with: {device}")
pipeline.to(device)

# Generate an image from a text prompt
prompt = "a crosswalk image"  # customize your prompt here
with torch.amp.autocast(device):  # use the detected device so autocast also works on CPU
    image = pipeline(prompt, num_inference_steps=50, guidance_scale=9).images[0]

# Save and show the generated image
image.resize((640, 360)).save("output.png")
image.resize((640, 360)).show()
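
# --- Optional: reproducible, batched generation ------------------------------
# A minimal sketch (not part of the original script) showing two common
# follow-ups with the standard diffusers API: seeding the sampler with a
# torch.Generator so runs are repeatable, and requesting several images per
# prompt in a single call. It reuses the `pipeline`, `prompt`, and `device`
# defined above; the seed 42 and the batch size of 4 are arbitrary
# illustrative choices.
generator = torch.Generator(device=device).manual_seed(42)  # fixed seed => identical outputs per run
images = pipeline(
    prompt,
    num_inference_steps=50,
    guidance_scale=9,
    num_images_per_prompt=4,  # generate 4 variations in one forward pass
    generator=generator,
).images
for i, img in enumerate(images):
    img.resize((640, 360)).save(f"output_{i}.png")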