Safetensors
kelseye committed on
Commit 6ddcbda · verified · 1 Parent(s): db57de4

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/1.png filter=lfs diff=lfs merge=lfs -text
+ assets/2.png filter=lfs diff=lfs merge=lfs -text
+ assets/3.png filter=lfs diff=lfs merge=lfs -text
+ title_image.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,156 @@
+ ---
+ license: apache-2.0
+ ---
+ # Qwen-Image EliGen Precise Region Control Model - E-commerce Poster
+
+ ![](./title_image.png)
+
+ ## Model Introduction
+
+ This model is jointly developed and open-sourced by the DiffSynth-Studio team from ModelScope and the Taotian Experience Design team.
+
+ Built upon [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), this model is designed for e-commerce poster generation and supports precise regional layout control. Implemented as a LoRA adapter, it lets users control the position and shape of each entity in a poster by providing a text description and a matching region mask for that entity. The model is trained with the [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) framework and further fine-tuned on poster image data on top of [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2), which significantly strengthens its control over poster layouts.
+
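+ For orientation: a region mask is an ordinary image in which white pixels mark where an entity should appear and black pixels mark everywhere else (the inference code below loads masks this way, and its visualization helper tests for pure white). A minimal sketch of building rectangular masks with PIL; the helper name, box coordinates, and file names are illustrative, not part of this repo:
+
+ ```python
+ from PIL import Image, ImageDraw
+
+ WIDTH, HEIGHT = 1280, 784  # output resolution used in the example below
+
+ def make_rect_mask(box, size=(WIDTH, HEIGHT)):
+     # White-on-black mask: white marks the entity's region
+     mask = Image.new("RGB", size, (0, 0, 0))
+     ImageDraw.Draw(mask).rectangle(box, fill=(255, 255, 255))
+     return mask
+
+ # Illustrative two-entity layout: headline on the left, product on the right
+ make_rect_mask((80, 80, 560, 360)).save("entity_0_mask.png")
+ make_rect_mask((700, 120, 1200, 700)).save("entity_1_mask.png")
+ ```
+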
+ ## Result Demonstration
+
+ |Entity Control Condition|Generated Image|
+ |-|-|
+ |![mask1](./assets/1_mask.png)|![image1](./assets/1.png)|
+ |![mask2](./assets/2_mask.png)|![image2](./assets/2.png)|
+ |![mask3](./assets/3_mask.png)|![image3](./assets/3.png)|
+
+ ## Inference Code
+ Install DiffSynth-Studio from source, then run the script below:
+ ```
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+ ```python
+ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+ import torch
+ from PIL import Image, ImageDraw, ImageFont
+ from modelscope import dataset_snapshot_download, snapshot_download
+ import random
+
+
+ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+     # Create a blank image for overlays
+     overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+
+     colors = [
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+     ]
+     # Generate random colors for each mask instead, if requested
+     if use_random_colors:
+         colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+     # Font settings
+     try:
+         font = ImageFont.truetype("wqy-zenhei.ttc", font_size)  # Adjust as needed
+     except IOError:
+         font = ImageFont.load_default(font_size)
+
+     # Overlay each mask onto the overlay image
+     for mask, mask_prompt, color in zip(masks, mask_prompts, colors):
+         # Convert mask to RGBA mode and color its white region with the mask color
+         mask_rgba = mask.convert('RGBA')
+         mask_data = mask_rgba.getdata()
+         new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data]
+         mask_rgba.putdata(new_data)
+
+         # Draw the mask prompt text on the mask
+         draw = ImageDraw.Draw(mask_rgba)
+         mask_bbox = mask.getbbox()  # Get the bounding box of the mask
+         text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)  # Place the label just inside the mask region
+         draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+         # Alpha composite the overlay with this mask
+         overlay = Image.alpha_composite(overlay, mask_rgba)
+
+     # Composite the overlay onto the original image
+     result = Image.alpha_composite(image.convert('RGBA'), overlay)
+
+     # Save the resulting image
+     result.save(output_path)
+
+     return result
+
+
+ def example(pipe, seeds, example_id, global_prompt, entity_prompts, height=784, width=1280):
+     # Download the example entity masks from the ModelScope dataset
+     dataset_snapshot_download(
+         dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+         local_dir="./",
+         allow_file_pattern=f"data/examples/eligen/poster/example_{example_id}/*.png"
+     )
+     masks = [
+         Image.open(f"./data/examples/eligen/poster/example_{example_id}/{i}.png").convert('RGB').resize((width, height))
+         for i in range(len(entity_prompts))
+     ]
+     negative_prompt = "grid, regular grid, blur, low resolution, low quality, distortion, malformed, incorrect anatomy, deformed hands, deformed body, deformed face, deformed hair, deformed eyes, deformed mouth"
+     for seed in seeds:
+         # Generate the image with entity-level region control
+         image = pipe(
+             prompt=global_prompt,
+             cfg_scale=4.0,
+             negative_prompt=negative_prompt,
+             num_inference_steps=40,
+             seed=seed,
+             height=height,
+             width=width,
+             eligen_entity_prompts=entity_prompts,
+             eligen_entity_masks=masks,
+         )
+         image.save(f"eligen_poster_example_{example_id}_{seed}.png")
+         # Render the mask layout on a black canvas for reference
+         image = Image.new("RGB", (width, height), (0, 0, 0))
+         visualize_masks(image, masks, entity_prompts, f"eligen_poster_example_{example_id}_mask_{seed}.png")
+
+
+ # Load the base Qwen-Image models
+ pipe = QwenImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device="cuda",
+     model_configs=[
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+     ],
+     tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
+ )
+ # Download and apply the EliGen-Poster LoRA
+ snapshot_download(
+     "DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     allow_file_pattern="model.safetensors",
+ )
+ pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-Poster/model.safetensors")
+ global_prompt = "A poster with a soft pink-purple background. On the left side, there is large pink-purple text reading \"Qwen-Image EliGen-Poster\", and inside a pink-purple oval frame, small white text reads: \"Image Generation Model with Precise Region Control\". On the right side, a little rabbit is unwrapping a gift, with a cartoon-style baby dragon standing beside it, wearing a mini firework launcher on its head. The background is dotted with some white clouds. The overall style is cute and cartoonish, conveying a festive and surprising theme."
+ entity_prompts = ["pink-purple text \"Qwen-Image EliGen-Poster\"", "small white text inside a pink-purple oval frame: \"Image Generation Model with Precise Region Control\"", "a little rabbit unwrapping a gift, with a cartoon-style baby dragon wearing a mini firework launcher standing beside it"]
+ seeds = [42]
+ example(pipe, seeds, 1, global_prompt, entity_prompts)
+ ```
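+
+ To drive the pipeline with your own layout instead of the downloaded example masks, user-made masks can be passed directly. A hedged sketch reusing `pipe` from above and the `make_rect_mask` helper from the earlier sketch; the prompts and boxes are illustrative, not from this repo:
+
+ ```python
+ global_prompt = "An e-commerce poster with a large headline on the left and a product photo on the right."
+ entity_prompts = ["large headline text", "product photo on a clean background"]
+ masks = [
+     make_rect_mask((80, 80, 560, 360)),     # headline region
+     make_rect_mask((700, 120, 1200, 700)),  # product region
+ ]
+ image = pipe(
+     prompt=global_prompt,
+     cfg_scale=4.0,
+     negative_prompt="blur, low resolution, low quality",
+     num_inference_steps=40,
+     seed=0,
+     height=784,
+     width=1280,
+     eligen_entity_prompts=entity_prompts,
+     eligen_entity_masks=masks,
+ )
+ image.save("custom_poster.png")
+ ```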
+
+ ## Citation
+ If you find our work helpful, please consider citing our paper:
+ ```
+ @article{zhang2025eligen,
+   title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
+   author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
+   journal={arXiv preprint arXiv:2501.01097},
+   year={2025}
+ }
+ ```
README_from_modelscope.md ADDED
@@ -0,0 +1,189 @@
+ ---
+ frameworks:
+ - Pytorch
+ license: Apache License 2.0
+ tasks:
+ - text-to-image-synthesis
+
+ #model-type:
+ ## e.g. gpt, phi, llama, chatglm, baichuan, etc.
+ #- gpt
+
+ #domain:
+ ## e.g. nlp, cv, audio, multi-modal
+ #- nlp
+
+ #language:
+ ## list of language codes: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
+ #- cn
+
+ #metrics:
+ ## e.g. CIDEr, BLEU, ROUGE, etc.
+ #- CIDEr
+
+ #tags:
+ ## custom tags, e.g. pretrained, fine-tuned, instruction-tuned, RL-tuned, and other training methods
+ #- pretrained
+
+ #tools:
+ ## e.g. vllm, fastchat, llamacpp, AdaSeq, etc.
+ #- vllm
+ base_model:
+ - Qwen/Qwen-Image
+ - DiffSynth-Studio/Eligen
+ base_model_relation: adapter
+ ---
+ # Qwen-Image EliGen Precise Region Control Model - E-commerce Poster
+
+ ![](./title_image.png)
+
+ ## Model Introduction
+
+ This model is jointly developed and open-sourced by the DiffSynth-Studio team from ModelScope and the Taotian Experience Design team.
+
+ Built upon [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), this model is designed for e-commerce poster generation and supports precise regional layout control. Implemented as a LoRA adapter, it lets users control the position and shape of each entity in a poster by providing a text description and a matching region mask for that entity. The model is trained with the [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) framework and further fine-tuned on poster image data on top of [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2), which significantly strengthens its control over poster layouts.
+
+ ## Result Demonstration
+
+ |Entity Control Condition|Generated Image|
+ |-|-|
+ |![mask1](./assets/1_mask.png)|![image1](./assets/1.png)|
+ |![mask2](./assets/2_mask.png)|![image2](./assets/2.png)|
+ |![mask3](./assets/3_mask.png)|![image3](./assets/3.png)|
+
+ ## Inference Code
+ Install DiffSynth-Studio from source, then run the script below:
+ ```
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+ ```python
+ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+ import torch
+ from PIL import Image, ImageDraw, ImageFont
+ from modelscope import dataset_snapshot_download, snapshot_download
+ import random
+
+
+ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+     # Create a blank image for overlays
+     overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+
+     colors = [
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+         (165, 238, 173, 80),
+         (76, 102, 221, 80),
+         (221, 160, 77, 80),
+         (204, 93, 71, 80),
+         (145, 187, 149, 80),
+         (134, 141, 172, 80),
+         (157, 137, 109, 80),
+         (153, 104, 95, 80),
+     ]
+     # Generate random colors for each mask instead, if requested
+     if use_random_colors:
+         colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+     # Font settings
+     try:
+         font = ImageFont.truetype("wqy-zenhei.ttc", font_size)  # Adjust as needed
+     except IOError:
+         font = ImageFont.load_default(font_size)
+
+     # Overlay each mask onto the overlay image
+     for mask, mask_prompt, color in zip(masks, mask_prompts, colors):
+         # Convert mask to RGBA mode and color its white region with the mask color
+         mask_rgba = mask.convert('RGBA')
+         mask_data = mask_rgba.getdata()
+         new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data]
+         mask_rgba.putdata(new_data)
+
+         # Draw the mask prompt text on the mask
+         draw = ImageDraw.Draw(mask_rgba)
+         mask_bbox = mask.getbbox()  # Get the bounding box of the mask
+         text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)  # Place the label just inside the mask region
+         draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+         # Alpha composite the overlay with this mask
+         overlay = Image.alpha_composite(overlay, mask_rgba)
+
+     # Composite the overlay onto the original image
+     result = Image.alpha_composite(image.convert('RGBA'), overlay)
+
+     # Save the resulting image
+     result.save(output_path)
+
+     return result
+
+
+ def example(pipe, seeds, example_id, global_prompt, entity_prompts, height=784, width=1280):
+     # Download the example entity masks from the ModelScope dataset
+     dataset_snapshot_download(
+         dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+         local_dir="./",
+         allow_file_pattern=f"data/examples/eligen/poster/example_{example_id}/*.png"
+     )
+     masks = [
+         Image.open(f"./data/examples/eligen/poster/example_{example_id}/{i}.png").convert('RGB').resize((width, height))
+         for i in range(len(entity_prompts))
+     ]
+     negative_prompt = "grid, regular grid, blur, low resolution, low quality, distortion, malformed, incorrect anatomy, deformed hands, deformed body, deformed face, deformed hair, deformed eyes, deformed mouth"
+     for seed in seeds:
+         # Generate the image with entity-level region control
+         image = pipe(
+             prompt=global_prompt,
+             cfg_scale=4.0,
+             negative_prompt=negative_prompt,
+             num_inference_steps=40,
+             seed=seed,
+             height=height,
+             width=width,
+             eligen_entity_prompts=entity_prompts,
+             eligen_entity_masks=masks,
+         )
+         image.save(f"eligen_poster_example_{example_id}_{seed}.png")
+         # Render the mask layout on a black canvas for reference
+         image = Image.new("RGB", (width, height), (0, 0, 0))
+         visualize_masks(image, masks, entity_prompts, f"eligen_poster_example_{example_id}_mask_{seed}.png")
+
+
+ # Load the base Qwen-Image models
+ pipe = QwenImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device="cuda",
+     model_configs=[
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+     ],
+     tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
+ )
+ # Download and apply the EliGen-Poster LoRA
+ snapshot_download(
+     "DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-Poster",
+     allow_file_pattern="model.safetensors",
+ )
+ pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-Poster/model.safetensors")
+ global_prompt = "A poster with a soft pink-purple background. On the left side, there is large pink-purple text reading \"Qwen-Image EliGen-Poster\", and inside a pink-purple oval frame, small white text reads: \"Image Generation Model with Precise Region Control\". On the right side, a little rabbit is unwrapping a gift, with a cartoon-style baby dragon standing beside it, wearing a mini firework launcher on its head. The background is dotted with some white clouds. The overall style is cute and cartoonish, conveying a festive and surprising theme."
+ entity_prompts = ["pink-purple text \"Qwen-Image EliGen-Poster\"", "small white text inside a pink-purple oval frame: \"Image Generation Model with Precise Region Control\"", "a little rabbit unwrapping a gift, with a cartoon-style baby dragon wearing a mini firework launcher standing beside it"]
+ seeds = [42]
+ example(pipe, seeds, 1, global_prompt, entity_prompts)
+ ```
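+
+ As a quick sanity check before generation: the pipeline expects one mask per entity prompt, each sized to the output resolution, with white marking the entity's region (this mirrors how `example` above loads and resizes the dataset masks). A small assumed helper, not part of this repo:
+
+ ```python
+ def check_entity_inputs(entity_prompts, masks, height, width):
+     # One mask per entity prompt, each matching the output resolution
+     assert len(masks) == len(entity_prompts), "each entity prompt needs a mask"
+     for mask in masks:
+         assert mask.size == (width, height), f"expected {(width, height)}, got {mask.size}"
+ ```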
+
+ ## Citation
+ If you find our work helpful, please consider citing our work:
+ ```
+ @article{zhang2025eligen,
+   title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
+   author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
+   journal={arXiv preprint arXiv:2501.01097},
+   year={2025}
+ }
+ ```
assets/1.png ADDED

Git LFS Details

  • SHA256: 5092d61025250c67fd008964e4ce5c14b2f8422e6910e1594f0065480e530667
  • Pointer size: 131 Bytes
  • Size of remote file: 445 kB
assets/1_mask.png ADDED
assets/2.png ADDED

Git LFS Details

  • SHA256: d9f51c8821bf7b8bae5a075abc54c68b1f308b02c2605305b2907a451c9370d4
  • Pointer size: 131 Bytes
  • Size of remote file: 607 kB
assets/2_mask.png ADDED
assets/3.png ADDED

Git LFS Details

  • SHA256: 18a423adb2ade0433dcb964a8c05bd2f4e786abec24d4bb07a9ec01d2e0513f2
  • Pointer size: 131 Bytes
  • Size of remote file: 668 kB
assets/3_mask.png ADDED
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework":"Pytorch","task":"text-to-image-synthesis"}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c412937571591548932872cf567a281277cb1545c4b79968ad735a199605405
+ size 472047184
title_image.png ADDED

Git LFS Details

  • SHA256: 7ee2c51469c25a8cf5d6fb904838ed545ff11f8a10ebb3526e0b0a03f46d5c8e
  • Pointer size: 131 Bytes
  • Size of remote file: 812 kB