Safetensors
kelseye committed on
Commit 6ddcbda · verified · 1 Parent(s): db57de4

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/1.png filter=lfs diff=lfs merge=lfs -text
+ assets/2.png filter=lfs diff=lfs merge=lfs -text
+ assets/3.png filter=lfs diff=lfs merge=lfs -text
+ title_image.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,156 @@
+ ---
+ license: apache-2.0
+ ---
+ # Qwen-Image EliGen Precise Region Control Model - E-commerce Poster
+
+ ![](./title_image.png)
+
+ ## Model Introduction
+
+ This model is jointly developed and open-sourced by the DiffSynth-Studio team from ModelScope and the Taotian Experience Design team.
+
+ Built upon [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), this model is designed for e-commerce poster generation and supports precise regional layout control. Implemented as a LoRA adapter, it lets users control the position and shape of each entity in a poster by providing a text description and a matching region mask for that entity. The model is trained with the [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) framework and further fine-tuned on poster image data on top of [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2), which significantly strengthens its control over poster layouts.
+
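+ For orientation: a region mask is an ordinary image in which white pixels mark where an entity should appear and black pixels mark everywhere else (the inference code below loads masks this way, and its visualization helper tests for pure white). A minimal sketch of building rectangular masks with PIL; the helper name, box coordinates, and file names are illustrative, not part of this repo:
+
+ ```python
+ from PIL import Image, ImageDraw
+
+ WIDTH, HEIGHT = 1280, 784  # output resolution used in the example below
+
+ def make_rect_mask(box, size=(WIDTH, HEIGHT)):
+     # White-on-black mask: white marks the entity's region
+     mask = Image.new("RGB", size, (0, 0, 0))
+     ImageDraw.Draw(mask).rectangle(box, fill=(255, 255, 255))
+     return mask
+
+ # Illustrative two-entity layout: headline on the left, product on the right
+ make_rect_mask((80, 80, 560, 360)).save("entity_0_mask.png")
+ make_rect_mask((700, 120, 1200, 700)).save("entity_1_mask.png")
+ ```
+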
+ ## Result Demonstration
+
+ |Entity Control Condition|Generated Image|
+ |-|-|
+ |![mask1](./assets/1_mask.png)|![image1](./assets/1.png)|
+ |![mask2](./assets/2_mask.png)|![image2](./assets/2.png)|
+ |![mask3](./assets/3_mask.png)|![image3](./assets/3.png)|
+
+ ## Inference Code
+ Install DiffSynth-Studio from source, then run the script below:
+ ```
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+ ```python
+ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+ import torch
+ from PIL import Image, ImageDraw, ImageFont
+ from modelscope import dataset_snapshot_download, snapshot_download
+ import random
+
+
+ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+     # Create a blank image for overlays
+     overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+
+     colors = [
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+     ]
+     # Generate random colors for each mask instead, if requested
+     if use_random_colors:
+         colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+     # Font settings
+     try:
+         font = ImageFont.truetype("wqy-zenhei.ttc", font_size)  # Adjust as needed
+     except IOError:
+         font = ImageFont.load_default(font_size)
+
+     # Overlay each mask onto the overlay image
+     for mask, mask_prompt, color in zip(masks, mask_prompts, colors):
+         # Convert mask to RGBA mode and color its white region with the mask color
+         mask_rgba = mask.convert('RGBA')
+         mask_data = mask_rgba.getdata()
+         new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data]
+         mask_rgba.putdata(new_data)
+
+         # Draw the mask prompt text on the mask
+         draw = ImageDraw.Draw(mask_rgba)
+         mask_bbox = mask.getbbox()  # Get the bounding box of the mask
+         text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)  # Place the label just inside the mask region
+         draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+         # Alpha composite the overlay with this mask
+         overlay = Image.alpha_composite(overlay, mask_rgba)
+
+     # Composite the overlay onto the original image
+     result = Image.alpha_composite(image.convert('RGBA'), overlay)
+
+     # Save the resulting image
+     result.save(output_path)
+
+     return result
+
+
+ def example(pipe, seeds, example_id, global_prompt, entity_prompts, height=784, width=1280):
+     # Download the example entity masks from the ModelScope dataset
+     dataset_snapshot_download(
+         dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+         local_dir="./",
+         allow_file_pattern=f"data/examples/eligen/poster/example_{example_id}/*.png"
+     )
+     masks = [
+         Image.open(f"./data/examples/eligen/poster/example_{example_id}/{i}.png").convert('RGB').resize((width, height))
+         for i in range(len(entity_prompts))
+     ]
+     negative_prompt = "grid, regular grid, blur, low resolution, low quality, distortion, malformed, incorrect anatomy, deformed hands, deformed body, deformed face, deformed hair, deformed eyes, deformed mouth"
+     for seed in seeds:
+         # Generate the image with entity-level region control
+         image = pipe(
+             prompt=global_prompt,
+             cfg_scale=4.0,
+             negative_prompt=negative_prompt,
+             num_inference_steps=40,
+             seed=seed,
+             height=height,
+             width=width,
+             eligen_entity_prompts=entity_prompts,
+             eligen_entity_masks=masks,
+         )
+         image.save(f"eligen_poster_example_{example_id}_{seed}.png")
+         # Render the mask layout on a black canvas for reference
+         image = Image.new("RGB", (width, height), (0, 0, 0))
+         visualize_masks(image, masks, entity_prompts, f"eligen_poster_example_{example_id}_mask_{seed}.png")
+
+
+ # Load the base Qwen-Image models
+ pipe = QwenImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device="cuda",
+     model_configs=[
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+     ],
+     tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
+ )
+ # Download and apply the EliGen-Poster LoRA
+ snapshot_download(
+     "DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     allow_file_pattern="model.safetensors",
+ )
+ pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-Poster/model.safetensors")
+ global_prompt = "A poster with a soft pink-purple background. On the left side, there is large pink-purple text reading \"Qwen-Image EliGen-Poster\", and inside a pink-purple oval frame, small white text reads: \"Image Generation Model with Precise Region Control\". On the right side, a little rabbit is unwrapping a gift, with a cartoon-style baby dragon standing beside it, wearing a mini firework launcher on its head. The background is dotted with some white clouds. The overall style is cute and cartoonish, conveying a festive and surprising theme."
+ entity_prompts = ["pink-purple text \"Qwen-Image EliGen-Poster\"", "small white text inside a pink-purple oval frame: \"Image Generation Model with Precise Region Control\"", "a little rabbit unwrapping a gift, with a cartoon-style baby dragon wearing a mini firework launcher standing beside it"]
+ seeds = [42]
+ example(pipe, seeds, 1, global_prompt, entity_prompts)
+ ```
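+
+ To drive the pipeline with your own layout instead of the downloaded example masks, user-made masks can be passed directly. A hedged sketch reusing `pipe` from above and the `make_rect_mask` helper from the earlier sketch; the prompts and boxes are illustrative, not from this repo:
+
+ ```python
+ global_prompt = "An e-commerce poster with a large headline on the left and a product photo on the right."
+ entity_prompts = ["large headline text", "product photo on a clean background"]
+ masks = [
+     make_rect_mask((80, 80, 560, 360)),     # headline region
+     make_rect_mask((700, 120, 1200, 700)),  # product region
+ ]
+ image = pipe(
+     prompt=global_prompt,
+     cfg_scale=4.0,
+     negative_prompt="blur, low resolution, low quality",
+     num_inference_steps=40,
+     seed=0,
+     height=784,
+     width=1280,
+     eligen_entity_prompts=entity_prompts,
+     eligen_entity_masks=masks,
+ )
+ image.save("custom_poster.png")
+ ```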
+
+ ## Citation
+ If you find our work helpful, please consider citing our paper:
+ ```
+ @article{zhang2025eligen,
+   title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
+   author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
+   journal={arXiv preprint arXiv:2501.01097},
+   year={2025}
+ }
+ ```
README_from_modelscope.md ADDED
@@ -0,0 +1,189 @@
+ ---
+ frameworks:
+ - Pytorch
+ license: Apache License 2.0
+ tasks:
+ - text-to-image-synthesis
+
+ #model-type:
+ ## e.g. gpt, phi, llama, chatglm, baichuan, etc.
+ #- gpt
+
+ #domain:
+ ## e.g. nlp, cv, audio, multi-modal
+ #- nlp
+
+ #language:
+ ## list of language codes: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
+ #- cn
+
+ #metrics:
+ ## e.g. CIDEr, BLEU, ROUGE, etc.
+ #- CIDEr
+
+ #tags:
+ ## custom tags, e.g. pretrained, fine-tuned, instruction-tuned, RL-tuned, and other training methods
+ #- pretrained
+
+ #tools:
+ ## e.g. vllm, fastchat, llamacpp, AdaSeq, etc.
+ #- vllm
+ base_model:
+ - Qwen/Qwen-Image
+ - DiffSynth-Studio/Eligen
+ base_model_relation: adapter
+ ---
+ # Qwen-Image EliGen Precise Region Control Model - E-commerce Poster
+
+ ![](./title_image.png)
+
+ ## Model Introduction
+
+ This model is jointly developed and open-sourced by the DiffSynth-Studio team from ModelScope and the Taotian Experience Design team.
+
+ Built upon [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), this model is designed for e-commerce poster generation and supports precise regional layout control. Implemented as a LoRA adapter, it lets users control the position and shape of each entity in a poster by providing a text description and a matching region mask for that entity. The model is trained with the [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) framework and further fine-tuned on poster image data on top of [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2), which significantly strengthens its control over poster layouts.
+
+ ## Result Demonstration
+
+ |Entity Control Condition|Generated Image|
+ |-|-|
+ |![mask1](./assets/1_mask.png)|![image1](./assets/1.png)|
+ |![mask2](./assets/2_mask.png)|![image2](./assets/2.png)|
+ |![mask3](./assets/3_mask.png)|![image3](./assets/3.png)|
+
+ ## Inference Code
+ Install DiffSynth-Studio from source, then run the script below:
+ ```
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+ ```python
+ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+ import torch
+ from PIL import Image, ImageDraw, ImageFont
+ from modelscope import dataset_snapshot_download, snapshot_download
+ import random
+
+
+ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+     # Create a blank image for overlays
+     overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+
+     colors = [
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+     ]
+     # Generate random colors for each mask instead, if requested
+     if use_random_colors:
+         colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+     # Font settings
+     try:
+         font = ImageFont.truetype("wqy-zenhei.ttc", font_size)  # Adjust as needed
+     except IOError:
+         font = ImageFont.load_default(font_size)
+
+     # Overlay each mask onto the overlay image
+     for mask, mask_prompt, color in zip(masks, mask_prompts, colors):
+         # Convert mask to RGBA mode and color its white region with the mask color
+         mask_rgba = mask.convert('RGBA')
+         mask_data = mask_rgba.getdata()
+         new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data]
+         mask_rgba.putdata(new_data)
+
+         # Draw the mask prompt text on the mask
+         draw = ImageDraw.Draw(mask_rgba)
+         mask_bbox = mask.getbbox()  # Get the bounding box of the mask
+         text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)  # Place the label just inside the mask region
+         draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+         # Alpha composite the overlay with this mask
+         overlay = Image.alpha_composite(overlay, mask_rgba)
+
+     # Composite the overlay onto the original image
+     result = Image.alpha_composite(image.convert('RGBA'), overlay)
+
+     # Save the resulting image
+     result.save(output_path)
+
+     return result
+
+
+ def example(pipe, seeds, example_id, global_prompt, entity_prompts, height=784, width=1280):
+     # Download the example entity masks from the ModelScope dataset
+     dataset_snapshot_download(
+         dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+         local_dir="./",
+         allow_file_pattern=f"data/examples/eligen/poster/example_{example_id}/*.png"
+     )
+     masks = [
+         Image.open(f"./data/examples/eligen/poster/example_{example_id}/{i}.png").convert('RGB').resize((width, height))
+         for i in range(len(entity_prompts))
+     ]
+     negative_prompt = "grid, regular grid, blur, low resolution, low quality, distortion, malformed, incorrect anatomy, deformed hands, deformed body, deformed face, deformed hair, deformed eyes, deformed mouth"
+     for seed in seeds:
+         # Generate the image with entity-level region control
+         image = pipe(
+             prompt=global_prompt,
+             cfg_scale=4.0,
+             negative_prompt=negative_prompt,
+             num_inference_steps=40,
+             seed=seed,
+             height=height,
+             width=width,
+             eligen_entity_prompts=entity_prompts,
+             eligen_entity_masks=masks,
+         )
+         image.save(f"eligen_poster_example_{example_id}_{seed}.png")
+         # Render the mask layout on a black canvas for reference
+         image = Image.new("RGB", (width, height), (0, 0, 0))
+         visualize_masks(image, masks, entity_prompts, f"eligen_poster_example_{example_id}_mask_{seed}.png")
+
+
+ # Load the base Qwen-Image models
+ pipe = QwenImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device="cuda",
+     model_configs=[
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+     ],
+     tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
+ )
+ # Download and apply the EliGen-Poster LoRA
+ snapshot_download(
+     "DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     allow_file_pattern="model.safetensors",
+ )
+ pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-Poster/model.safetensors")
+ global_prompt = "A poster with a soft pink-purple background. On the left side, there is large pink-purple text reading \"Qwen-Image EliGen-Poster\", and inside a pink-purple oval frame, small white text reads: \"Image Generation Model with Precise Region Control\". On the right side, a little rabbit is unwrapping a gift, with a cartoon-style baby dragon standing beside it, wearing a mini firework launcher on its head. The background is dotted with some white clouds. The overall style is cute and cartoonish, conveying a festive and surprising theme."
+ entity_prompts = ["pink-purple text \"Qwen-Image EliGen-Poster\"", "small white text inside a pink-purple oval frame: \"Image Generation Model with Precise Region Control\"", "a little rabbit unwrapping a gift, with a cartoon-style baby dragon wearing a mini firework launcher standing beside it"]
+ seeds = [42]
+ example(pipe, seeds, 1, global_prompt, entity_prompts)
+ ```
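+
+ As a quick sanity check before generation: the pipeline expects one mask per entity prompt, each sized to the output resolution, with white marking the entity's region (this mirrors how `example` above loads and resizes the dataset masks). A small assumed helper, not part of this repo:
+
+ ```python
+ def check_entity_inputs(entity_prompts, masks, height, width):
+     # One mask per entity prompt, each matching the output resolution
+     assert len(masks) == len(entity_prompts), "each entity prompt needs a mask"
+     for mask in masks:
+         assert mask.size == (width, height), f"expected {(width, height)}, got {mask.size}"
+ ```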
+
+ ## Citation
+ If you find our work helpful, please consider citing our work:
+ ```
+ @article{zhang2025eligen,
+   title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
+   author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
+   journal={arXiv preprint arXiv:2501.01097},
+   year={2025}
+ }
+ ```
assets/1.png ADDED

Git LFS Details

  • SHA256: 5092d61025250c67fd008964e4ce5c14b2f8422e6910e1594f0065480e530667
  • Pointer size: 131 Bytes
  • Size of remote file: 445 kB
assets/1_mask.png ADDED
assets/2.png ADDED

Git LFS Details

  • SHA256: d9f51c8821bf7b8bae5a075abc54c68b1f308b02c2605305b2907a451c9370d4
  • Pointer size: 131 Bytes
  • Size of remote file: 607 kB
assets/2_mask.png ADDED
assets/3.png ADDED

Git LFS Details

  • SHA256: 18a423adb2ade0433dcb964a8c05bd2f4e786abec24d4bb07a9ec01d2e0513f2
  • Pointer size: 131 Bytes
  • Size of remote file: 668 kB
assets/3_mask.png ADDED
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework":"Pytorch","task":"text-to-image-synthesis"}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c412937571591548932872cf567a281277cb1545c4b79968ad735a199605405
+ size 472047184
title_image.png ADDED

Git LFS Details

  • SHA256: 7ee2c51469c25a8cf5d6fb904838ed545ff11f8a10ebb3526e0b0a03f46d5c8e
  • Pointer size: 131 Bytes
  • Size of remote file: 812 kB