Upload folder using huggingface_hub
- .gitattributes +7 -0
- README.md +78 -0
- README_from_modelscope.md +103 -0
- assets/samples/eligen_example_4_2.png +3 -0
- assets/samples/eligen_example_4_5.png +3 -0
- assets/samples/eligen_example_4_6.png +3 -0
- assets/samples/eligen_example_4_mask_3.png +0 -0
- assets/samples/eligen_example_7_0.png +3 -0
- assets/samples/eligen_example_7_2.png +3 -0
- assets/samples/eligen_example_7_6.png +3 -0
- assets/samples/eligen_example_7_mask_3.png +0 -0
- assets/title.png +3 -0
- configuration.json +1 -0
- model.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_4_2.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_4_5.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_4_6.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_7_0.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_7_2.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_7_6.png filter=lfs diff=lfs merge=lfs -text
+assets/title.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED

---
license: apache-2.0
---
# Qwen-Image Precise Region Control Model

![](./assets/title.png)

## Model Introduction

This model is the V2 version of a precise region control model trained on [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image). The model is a LoRA adapter: by supplying a textual description and a regional condition (a mask map) for each entity, it controls the position and shape of every entity in the generated image. The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio), and the training data is the [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset).

Compared to the [V1](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen) version, this model is trained on a dataset self-generated by Qwen-Image, so the style of its outputs is more consistent with the base model.
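Each regional condition is simply an image. As a minimal sketch (not part of this repository), assuming the common convention that white pixels mark an entity's region and black pixels the background, a rectangular mask could be drawn with Pillow like this; the `make_entity_mask` helper and its coordinates are illustrative only:

```python
from PIL import Image, ImageDraw

def make_entity_mask(size, box):
    # Illustrative helper (assumed convention): a white rectangle marks the
    # entity's region on a black background. Real masks can be any shape.
    mask = Image.new("RGB", size, "black")
    ImageDraw.Draw(mask).rectangle(box, fill="white")
    return mask

# e.g., reserve the left half of a 1328x1328 canvas for one entity
left_mask = make_entity_mask((1328, 1328), (0, 0, 663, 1327))
```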
## Result Demonstration

|Entity Control Condition|Generated Image 1|Generated Image 2|Generated Image 3|
|-|-|-|-|
||||
||||
## Inference Code

Install DiffSynth-Studio from source:

```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
import torch
from PIL import Image
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
from modelscope import dataset_snapshot_download, snapshot_download
```

```python
# Load the base Qwen-Image components: DiT (transformer), text encoder, and VAE.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)

# Download the EliGen V2 LoRA and attach it to the DiT.
snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen-V2", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-V2", allow_file_pattern="model.safetensors")
pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-V2/model.safetensors")

# One global prompt for the whole scene, plus one prompt per entity.
global_prompt = "Poster for the Qwen-Image-EliGen Magic Café, featuring two magical coffees—one emitting flames and the other emitting ice spikes—against a light blue misty background. The poster includes text: 'Qwen-Image-EliGen Magic Café' and 'New Product Launch'"
entity_prompts = [
    "A red magic coffee with flames rising from the cup",
    "A red magic coffee surrounded by ice spikes",
    "Text: 'New Product Launch'",
    "Text: 'Qwen-Image-EliGen Magic Café'",
]

# Download the example masks; masks[i] marks the target region for entity_prompts[i].
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/qwen-image/example_6/*.png")
masks = [Image.open(f"./data/examples/eligen/qwen-image/example_6/{i}.png").convert("RGB").resize((1328, 1328)) for i in range(len(entity_prompts))]

image = pipe(
    prompt=global_prompt,
    seed=0,
    eligen_entity_prompts=entity_prompts,
    eligen_entity_masks=masks,
)
image.save("image.jpg")
```
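The script above drives the layout with downloaded example masks. To position entities yourself, the same call accepts any list of mask images aligned index by index with `entity_prompts`; below is a hypothetical variant reusing the illustrative `make_entity_mask` helper sketched earlier, with all box coordinates invented for this example:

```python
# Hypothetical layout: one hand-placed box per entity prompt above.
custom_masks = [
    make_entity_mask((1328, 1328), (80, 380, 620, 1080)),    # flaming coffee, left
    make_entity_mask((1328, 1328), (708, 380, 1248, 1080)),  # icy coffee, right
    make_entity_mask((1328, 1328), (400, 1140, 928, 1280)),  # "New Product Launch"
    make_entity_mask((1328, 1328), (200, 60, 1128, 240)),    # café name banner
]

image = pipe(
    prompt=global_prompt,
    seed=0,
    eligen_entity_prompts=entity_prompts,
    eligen_entity_masks=custom_masks,
)
image.save("image_custom_masks.jpg")
```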
## Citation

If you find our work helpful, please consider citing our research:

```
@article{zhang2025eligen,
  title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
  author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
  journal={arXiv preprint arXiv:2501.01097},
  year={2025}
}
```
README_from_modelscope.md ADDED

---
frameworks:
- Pytorch
license: Apache License 2.0
tasks:
- text-to-image-synthesis

#model-type:
## e.g., gpt, phi, llama, chatglm, baichuan
#- gpt

#domain:
## e.g., nlp, cv, audio, multi-modal
#- nlp

#language:
## Language code list: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
#- cn

#metrics:
## e.g., CIDEr, BLEU, ROUGE
#- CIDEr

#tags:
## Custom tags, covering training methods such as pretrained, fine-tuned, instruction-tuned, RL-tuned, and others
#- pretrained

#tools:
## e.g., vllm, fastchat, llamacpp, AdaSeq
#- vllm
base_model:
- Qwen/Qwen-Image
base_model_relation: adapter
---
# Qwen-Image Precise Region Control Model

![](./assets/title.png)

## Model Introduction

This model is the V2 version of a precise region control model trained on [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image). The model is a LoRA adapter: by supplying a textual description and a regional condition (a mask map) for each entity, it controls the position and shape of every entity in the generated image. The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio), and the training data is the [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset).

Compared to the [V1](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen) version, this model is trained on a dataset self-generated by Qwen-Image, so the style of its outputs is more consistent with the base model.

## Result Demonstration

|Entity Control Condition|Generated Image 1|Generated Image 2|Generated Image 3|
|-|-|-|-|
||||
||||

## Inference Code

Install DiffSynth-Studio from source:

```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
import torch
from PIL import Image
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
from modelscope import dataset_snapshot_download, snapshot_download

# Load the base Qwen-Image components: DiT (transformer), text encoder, and VAE.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)

# Download the EliGen V2 LoRA and attach it to the DiT.
snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen-V2", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen-V2", allow_file_pattern="model.safetensors")
pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen-V2/model.safetensors")

# One global prompt for the whole scene, plus one prompt per entity.
global_prompt = "Qwen-Image-EliGen魔法咖啡厅的宣传海报,主体是两杯魔法咖啡,一杯冒着火焰,一杯冒着冰锥,背景是浅蓝色水雾,海报写着“Qwen-Image-EliGen魔法咖啡厅”、“新品上市”"
entity_prompts = ["一杯红色魔法咖啡,杯中火焰燃烧", "一杯红色魔法咖啡,杯中冰锥环绕", "字:“新品上市”", "字:“Qwen-Image-EliGen魔法咖啡厅”"]

# Download the example masks; masks[i] marks the target region for entity_prompts[i].
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/qwen-image/example_6/*.png")
masks = [Image.open(f"./data/examples/eligen/qwen-image/example_6/{i}.png").convert("RGB").resize((1328, 1328)) for i in range(len(entity_prompts))]

image = pipe(
    prompt=global_prompt,
    seed=0,
    eligen_entity_prompts=entity_prompts,
    eligen_entity_masks=masks,
)
image.save("image.jpg")
```

## Citation

If you find our work helpful, please consider citing our research:

```
@article{zhang2025eligen,
  title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
  author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
  journal={arXiv preprint arXiv:2501.01097},
  year={2025}
}
```
assets/samples/eligen_example_4_2.png ADDED (Git LFS)
assets/samples/eligen_example_4_5.png ADDED (Git LFS)
assets/samples/eligen_example_4_6.png ADDED (Git LFS)
assets/samples/eligen_example_4_mask_3.png ADDED
assets/samples/eligen_example_7_0.png ADDED (Git LFS)
assets/samples/eligen_example_7_2.png ADDED (Git LFS)
assets/samples/eligen_example_7_6.png ADDED (Git LFS)
assets/samples/eligen_example_7_mask_3.png ADDED
assets/title.png ADDED (Git LFS)
configuration.json ADDED

{"framework":"Pytorch","task":"text-to-image-synthesis"}
model.safetensors ADDED (Git LFS pointer)

version https://git-lfs.github.com/spec/v1
oid sha256:ee3bdf8186880e29ace936c547137a0b72fb2c02bc7025e8d5b1f40d3fd2a03e
size 472047184
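The model.safetensors entry above is a Git LFS pointer rather than the weights themselves: three `key value` lines recording the spec version, the SHA-256 object ID, and the payload size (472,047,184 bytes, roughly 472 MB). A minimal sketch of reading those fields, assuming the pointer text is already in hand:

```python
def parse_lfs_pointer(text):
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:ee3bdf8186880e29ace936c547137a0b72fb2c02bc7025e8d5b1f40d3fd2a03e
size 472047184"""

fields = parse_lfs_pointer(pointer)
print(fields["oid"])                    # sha256:ee3bdf81...
print(int(fields["size"]) / 1e6, "MB")  # ~472 MB
```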