Safetensors

Qwen-Image EliGen Precise Region Control Model - E-commerce Poster

Model Introduction

This model is jointly developed and open-sourced by the DiffSynth-Studio team from ModelScope and the Taotian Experience Design team.

Built upon Qwen-Image, this model is specifically designed for e-commerce poster generation and supports precise regional layout control. It is implemented as a LoRA: users can flexibly control the position and shape of each entity in the poster by providing text descriptions along with corresponding region masks. The model is trained with the DiffSynth-Studio framework, starting from DiffSynth-Studio/Qwen-Image-EliGen-V2 and further fine-tuned on poster image data, which significantly enhances its ability to control poster layouts.

Result Demonstration

Entity Control Condition Generated Image
image1 mask1
image1 mask1
image1 mask1

Inference Code

git clone https://github.com/modelscope/DiffSynth-Studio.git  
cd DiffSynth-Studio
pip install -e .
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
import torch
from PIL import Image, ImageDraw, ImageFont
from modelscope import dataset_snapshot_download, snapshot_download
import random


def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
    """Draw every region mask as a translucent, labeled overlay on ``image``.

    Args:
        image: PIL image used as the background; it is converted to RGBA
            before compositing.
        masks: Sequence of PIL images. Pixels whose RGB value is exactly
            (255, 255, 255) are treated as belonging to the region; all other
            pixels become fully transparent. Each mask must have the same
            size as ``image`` because ``Image.alpha_composite`` is used.
        mask_prompts: Label text for each mask, paired with ``masks`` by
            position (extra items on either side are ignored).
        output_path: Path the composited image is saved to.
        font_size: Size of the label font.
        use_random_colors: When true, draw one random color per mask instead
            of using the built-in palette.

    Returns:
        The composited RGBA image (the same image that was saved).
    """
    # Start from a fully transparent layer that accumulates all mask overlays.
    overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))

    # Built-in palette (alpha 80 keeps the background visible). It is cycled
    # below, so any number of masks can be drawn.
    palette = [
        (165, 238, 173, 80),
        (76, 102, 221, 80),
        (221, 160, 77, 80),
        (204, 93, 71, 80),
        (145, 187, 149, 80),
        (134, 141, 172, 80),
        (157, 137, 109, 80),
        (153, 104, 95, 80),
    ]
    if use_random_colors:
        palette = [
            (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80)
            for _ in range(len(masks))
        ]

    # Font settings: prefer a CJK-capable font, fall back to Pillow's default.
    try:
        font = ImageFont.truetype("wqy-zenhei.ttc", font_size)  # Adjust as needed
    except OSError:
        try:
            font = ImageFont.load_default(font_size)
        except TypeError:
            # Older Pillow releases do not accept a size for the default font.
            font = ImageFont.load_default()

    # Overlay each mask onto the accumulated overlay image.
    for index, (mask, mask_prompt) in enumerate(zip(masks, mask_prompts)):
        color = palette[index % len(palette)]

        # Recolor the mask: white pixels take the region color, the rest
        # become transparent.
        mask_rgba = mask.convert('RGBA')
        mask_rgba.putdata([
            color if pixel[:3] == (255, 255, 255) else (0, 0, 0, 0)
            for pixel in mask_rgba.getdata()
        ])

        # Label the region near the top-left corner of its bounding box.
        # getbbox() returns None for a mask with no non-zero pixels; such a
        # mask has no region to label, so the text is skipped.
        mask_bbox = mask.getbbox()
        if mask_bbox is not None:
            draw = ImageDraw.Draw(mask_rgba)
            text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)
            draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)

        overlay = Image.alpha_composite(overlay, mask_rgba)

    # Composite the overlay onto the original image and persist the result.
    result = Image.alpha_composite(image.convert('RGBA'), overlay)
    result.save(output_path)

    return result

def example(pipe, seeds, example_id, global_prompt, entity_prompts, height=784, width=1280):
    """Generate poster images for one bundled example and save mask previews.

    Downloads the example's mask PNGs from the
    ``DiffSynth-Studio/examples_in_diffsynth`` dataset, then for every seed
    runs ``pipe`` with the global prompt, the entity prompts and their masks,
    saves the generated image, and saves a visualization of the masks drawn
    on a black canvas.

    Args:
        pipe: Callable pipeline; it is invoked with the keyword arguments
            shown below and its return value must provide ``.save(path)``.
        seeds: Iterable of seeds; one image and one mask preview are written
            per seed.
        example_id: Identifier of the example directory
            (``data/examples/eligen/poster/example_{example_id}``).
        global_prompt: Prompt describing the whole poster.
        entity_prompts: One prompt per entity; mask ``{i}.png`` is paired
            with ``entity_prompts[i]``.
        height: Output height; masks are resized to ``(width, height)``.
        width: Output width.
    """
    dataset_snapshot_download(
        dataset_id="DiffSynth-Studio/examples_in_diffsynth",
        local_dir="./",
        allow_file_pattern=f"data/examples/eligen/poster/example_{example_id}/*.png"
    )
    # One mask file per entity prompt, named by its index.
    masks = [
        Image.open(f"./data/examples/eligen/poster/example_{example_id}/{i}.png").convert('RGB').resize((width, height))
        for i in range(len(entity_prompts))
    ]
    negative_prompt = "grid, regular grid, blur, low resolution, low quality, distortion, malformed, incorrect anatomy, deformed hands, deformed body, deformed face, deformed hair, deformed eyes, deformed mouth"
    for seed in seeds:
        # generate image
        image = pipe(
            prompt=global_prompt,
            cfg_scale=4.0,
            negative_prompt=negative_prompt,
            num_inference_steps=40,
            seed=seed,
            height=height,
            width=width,
            eligen_entity_prompts=entity_prompts,
            eligen_entity_masks=masks,
        )
        # Underscores separate the example id from the seed so that different
        # (example_id, seed) pairs can never map to the same filename.
        image.save(f"eligen_poster_example_{example_id}_{seed}.png")
        # Draw the masks on a black canvas of the output size.
        canvas = Image.new("RGB", (width, height), (0, 0, 0))
        visualize_masks(canvas, masks, entity_prompts, f"eligen_poster_example_{example_id}_mask_{seed}.png")

# Build the base Qwen-Image pipeline: transformer, text encoder and VAE
# weights are all taken from the "Qwen/Qwen-Image" repository.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern=pattern)
        for pattern in (
            "transformer/diffusion_pytorch_model*.safetensors",
            "text_encoder/model*.safetensors",
            "vae/diffusion_pytorch_model.safetensors",
        )
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)

# Fetch the EliGen-Poster LoRA weights and attach them to the pipeline's DiT.
snapshot_download(
    "DiffSynth-Studio/Qwen-Image-EliGen-Poster",
    local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-Poster",
    allow_file_pattern="model.safetensors",
)
pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-Poster/model.safetensors")

# Prompt for the whole poster (adjacent literals are concatenated verbatim).
global_prompt = (
    "A poster with a soft pink-purple background. "
    "On the left side, there is large pink-purple text reading \"Qwen-Image EliGen-Poster\", "
    "and inside a pink-purple oval frame, small white text reads: "
    "\"Image Generation Model with Precise Region Control\". "
    "On the right side, a little rabbit is unwrapping a gift, "
    "with a cartoon-style baby dragon standing beside it, "
    "wearing a mini firework launcher on its head. "
    "The background is dotted with some white clouds. "
    "The overall style is cute and cartoonish, conveying a festive and surprising theme."
)

# One prompt per masked region, in the same order as the example's mask files.
entity_prompts = [
    "pink-purple text \"Qwen-Image EliGen-Poster\"",
    "small white text inside a pink-purple oval frame: \"Image Generation Model with Precise Region Control\"",
    "a little rabbit unwrapping a gift, with a cartoon-style baby dragon wearing a mini firework launcher standing beside it",
]

# Seeds to render; example 1 is generated once per seed.
seed = [42]
example(
    pipe,
    seeds=seed,
    example_id=1,
    global_prompt=global_prompt,
    entity_prompts=entity_prompts,
)

Citation

If you find our work helpful, please consider citing our paper:

@article{zhang2025eligen,
  title={Eligen: Entity-level controlled image generation with regional attention},
  author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
  journal={arXiv preprint arXiv:2501.01097},
  year={2025}
}
Downloads last month

-

Downloads are not tracked for this model. How to track
Safetensors
Model size
0.2B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support