File size: 15,648 Bytes
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6391fa9
92c3d7f
 
 
 
 
 
 
 
 
 
 
ecd462b
92c3d7f
 
 
 
ecd462b
92c3d7f
 
 
 
ecd462b
 
 
 
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecd462b
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84ade0a
33976bc
 
 
 
 
 
 
 
 
 
 
92c3d7f
 
 
 
 
 
 
 
 
 
 
ecd462b
92c3d7f
 
 
ecd462b
 
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc04eca
 
92c3d7f
 
cc04eca
 
 
 
 
 
 
 
 
 
 
92c3d7f
 
 
 
 
25dcda0
92c3d7f
 
 
 
 
 
 
 
 
 
 
6391fa9
 
92c3d7f
 
 
 
 
 
 
 
122eb28
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33976bc
 
92c3d7f
 
 
 
 
 
33976bc
 
92c3d7f
 
 
 
 
 
33976bc
 
92c3d7f
 
ecd462b
92c3d7f
 
 
33976bc
 
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
33976bc
92c3d7f
33976bc
92c3d7f
 
 
 
 
33976bc
92c3d7f
33976bc
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122eb28
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6391fa9
92c3d7f
fde6683
 
92c3d7f
 
 
 
 
 
 
 
 
 
122eb28
92c3d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16f8b88
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import spaces
import argparse

import os
import shutil
import cv2
import gradio as gr
import numpy as np
import torch
from facexlib.utils.face_restoration_helper import FaceRestoreHelper
import huggingface_hub
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms.functional import normalize

from dreamo.dreamo_pipeline import DreamOPipeline
from dreamo.utils import img2tensor, resize_numpy_image_area, tensor2img, resize_numpy_image_long
from tools import BEN2

# Command-line options. They are parsed at import time, so the flags are
# available to everything defined below through the module-level ``args``.
parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int, default=8080)
parser.add_argument('--no_turbo', action='store_true')
args = parser.parse_args()

# Authenticate against the Hugging Face Hub with the token from the environment.
# login() called with no token falls back to an interactive prompt, which cannot
# be answered in a headless deployment, so only log in when a token is present.
_hf_token = os.getenv('HF_TOKEN')
if _hf_token:
    huggingface_hub.login(_hf_token)
else:
    print("HF_TOKEN is not set, skip huggingface_hub login")

# Remove the example cache directory left behind by a previous run, if any.
try:
    shutil.rmtree('gradio_cached_examples')
except FileNotFoundError:
    print("cache folder not exist")

class Generator:
    """Hold the preprocessing models and the DreamO pipeline on the GPU.

    Attributes set by ``__init__``:
        device: the CUDA device every model is moved to.
        bg_rm_model: BEN2 background-removal network, in eval mode.
        face_helper: facexlib ``FaceRestoreHelper`` used to crop, align and parse faces.
        dreamo_pipeline: ``DreamOPipeline`` loaded from FLUX.1-dev with the DreamO weights.
    """

    def __init__(self):
        # keep the device on the instance so other methods reuse it instead of hard-coding it
        self.device = device = torch.device('cuda')
        # preprocessing models
        # background remove model: BEN2
        self.bg_rm_model = BEN2.BEN_Base().to(device).eval()
        hf_hub_download(repo_id='PramaLLC/BEN2', filename='BEN2_Base.pth', local_dir='models')
        self.bg_rm_model.loadcheckpoints('models/BEN2_Base.pth')
        # face crop and align tool: facexlib
        self.face_helper = FaceRestoreHelper(
            upscale_factor=1,
            face_size=512,
            crop_ratio=(1, 1),
            det_model='retinaface_resnet50',
            save_ext='png',
            device=device,
        )

        # load dreamo
        model_root = 'black-forest-labs/FLUX.1-dev'
        dreamo_pipeline = DreamOPipeline.from_pretrained(model_root, torch_dtype=torch.bfloat16)
        # turbo is on unless the --no_turbo command-line flag was given
        dreamo_pipeline.load_dreamo_model(device, use_turbo=not args.no_turbo)
        self.dreamo_pipeline = dreamo_pipeline.to(device)

    @torch.no_grad()
    def get_align_face(self, img):
        """Crop and align the center face of ``img`` and whiten everything that is not face.

        Args:
            img: image array; it is converted with ``cv2.COLOR_RGB2BGR`` before
                detection, so it is expected in RGB channel order.

        Returns:
            The aligned face image (output of ``tensor2img(..., rgb2bgr=False)``)
            in which every pixel whose parsing label is in ``bg_label`` has been
            replaced by white, or ``None`` when no face could be cropped.
        """
        # the face preprocessing code is same as PuLID
        self.face_helper.clean_all()
        image_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        self.face_helper.read_image(image_bgr)
        self.face_helper.get_face_landmarks_5(only_center_face=True)
        self.face_helper.align_warp_face()
        if not self.face_helper.cropped_faces:
            return None
        align_face = self.face_helper.cropped_faces[0]

        # scale to [0, 1] and move to the same device as the parsing model
        face_tensor = img2tensor(align_face, bgr2rgb=True).unsqueeze(0) / 255.0
        face_tensor = face_tensor.to(self.device)
        parsing_out = self.face_helper.face_parse(
            normalize(face_tensor, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        )[0]
        # per-pixel label index (channel dimension kept so the mask broadcasts over RGB)
        parsing_out = parsing_out.argmax(dim=1, keepdim=True)
        # parsing labels that are blanked out (presumably background, hair, ears,
        # neck, clothes, ... -- confirm against the face parsing model's label map)
        bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
        bg = torch.isin(parsing_out, torch.tensor(bg_label, device=parsing_out.device))
        white_image = torch.ones_like(face_tensor)
        # only keep the face features
        face_features_image = torch.where(bg, white_image, face_tensor)
        face_features_image = tensor2img(face_features_image, rgb2bgr=False)

        return face_features_image


# Build the shared Generator once at import time; generate_image uses this instance.
generator = Generator()


@spaces.GPU
@torch.inference_mode()
def generate_image(
    ref_image1,
    ref_image2,
    ref_task1,
    ref_task2,
    prompt,
    seed,
    width=1024,
    height=1024,
    ref_res=512,
    num_steps=12,
    guidance=3.5,
    true_cfg=1,
    cfg_start_step=0,
    cfg_end_step=0,
    neg_prompt='',
    neg_guidance=3.5,
    first_step_guidance=0,
):
    """Preprocess the reference images and run the DreamO pipeline once.

    Args:
        ref_image1, ref_image2: reference images as numpy arrays, or ``None``
            to leave that slot unused.
        ref_task1, ref_task2: task for the matching reference image. ``"id"``
            crops and aligns the face; ``"style"`` only resizes the image; any
            other value removes the background and then resizes.
        prompt: text prompt passed to the pipeline.
        seed: integer (or integer-like string) seed; ``-1`` draws a random one.
        width, height: size passed to the pipeline.
        ref_res: non-"id" references are resized to an area of ``ref_res * ref_res``.
        num_steps: passed as ``num_inference_steps``.
        guidance: passed as ``guidance_scale``.
        true_cfg, cfg_start_step, cfg_end_step: passed as ``true_cfg_scale``,
            ``true_cfg_start_step`` and ``true_cfg_end_step``.
        neg_prompt, neg_guidance: passed as ``negative_prompt`` and ``neg_guidance_scale``.
        first_step_guidance: passed as ``first_step_guidance_scale``; values
            that are not positive fall back to ``guidance``.

    Returns:
        ``(image, debug_images, seed)``: the first generated image, the list of
        preprocessed reference images, and the seed that was actually used.

    Raises:
        gr.Error: if the seed is not an integer, or if an "id" reference image
            contains no detectable face.
    """
    print(prompt)
    ref_conds = []
    debug_images = []

    ref_images = [ref_image1, ref_image2]
    ref_tasks = [ref_task1, ref_task2]

    # idx is 1-based: it is the reference slot number handed to the pipeline
    for idx, (ref_image, ref_task) in enumerate(zip(ref_images, ref_tasks), start=1):
        if ref_image is None:
            continue
        if ref_task == "id":
            ref_image = resize_numpy_image_long(ref_image, 1024)
            ref_image = generator.get_align_face(ref_image)
            if ref_image is None:
                # get_align_face returns None when no face was cropped; fail with a
                # readable message instead of crashing later inside img2tensor
                raise gr.Error(f"No face detected in ref image {idx}, please use another image for the id task")
        else:
            if ref_task != "style":
                ref_image = generator.bg_rm_model.inference(Image.fromarray(ref_image))
            ref_image = resize_numpy_image_area(np.array(ref_image), ref_res * ref_res)
        debug_images.append(ref_image)
        # scale to [0, 1], then map to [-1, 1]
        ref_image = img2tensor(ref_image, bgr2rgb=False).unsqueeze(0) / 255.0
        ref_image = 2 * ref_image - 1.0
        ref_conds.append(
            {
                'img': ref_image,
                'task': ref_task,
                'idx': idx,
            }
        )

    # the UI delivers the seed as free text, so reject anything that is not an integer
    try:
        seed = int(seed)
    except (TypeError, ValueError) as err:
        raise gr.Error(f"Seed must be an integer, got {seed!r}") from err
    if seed == -1:
        seed = torch.Generator(device="cpu").seed()

    image = generator.dreamo_pipeline(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=num_steps,
        guidance_scale=guidance,
        ref_conds=ref_conds,
        generator=torch.Generator(device="cpu").manual_seed(seed),
        true_cfg_scale=true_cfg,
        true_cfg_start_step=cfg_start_step,
        true_cfg_end_step=cfg_end_step,
        negative_prompt=neg_prompt,
        neg_guidance_scale=neg_guidance,
        first_step_guidance_scale=first_step_guidance if first_step_guidance > 0 else guidance,
    ).images[0]

    return image, debug_images, seed


_HEADER_ = '''
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
    <h1 style="font-size: 2.5rem; font-weight: 700; margin-bottom: 1rem; display: contents;">ihome AI Design</h1>
    <p style="font-size: 1rem; margin-bottom: 1.5rem;"> ihome AI Design: 家居AI图片处理</a> | </p>
</div>

核心功能:轻松“换装”您的家居场景!您可以上传一张家居单品(如沙发、灯具)的图片,通过文字描述您想要的房间风格,模型会将其无缝融入到一个全新的家居环境照片中,或者添加到您指定的现有房间照片里。
重要提示: 请务必先尝试下方的示例,这将帮助您更好地理解我们模型在家居场景生成与修改方面的能力以及目前支持的操作类型。
为每个输入选择合适的任务类型:
若为单个家居物品(如沙发、椅子、灯具等): 选择 “家居物品放置”(Item Placement) 模式。我们会自动识别并尝试移除该物品的原始背景。您只需在提示词 (prompt) 中描述希望它融入的房间环境和风格(例如:“将这个沙发放在一个现代简约风格的客厅窗边”),即可生成全新的家居场景图。
若为完整家居场景图(用于风格参考或局部修改): 选择 “场景风格编辑”(Scene Style/Edit) 模式。此模式下,原图的整体环境和风格将被保留。
如需进行风格化生成 (例如,将一个空房间照片变得充满某种特定风格并添置家具),您需要在提示词前加上:'generate a same style image.' (生成一张同样风格的图片) 来激活此任务,并描述您想添加或修改的内容。
如需在现有场景中直接添加或替换物品,请直接在提示词中描述您的具体操作(例如:“在图片中的壁炉旁添加这个上传的椅子”或“把图片中的茶几替换成一个圆形的木质茶几”)。
关键参数 - 引导强度 (Guidance Scale): 默认值为 3.5。
如果您发现家具材质显得过于光亮、不真实,或者物品与环境光照融合不自然,可以适当降低引导强度 (例如调整到 3.0)。
反之,如果家具或场景细节呈现不佳,或者物品摆放出现扭曲变形、与场景透视不符,可以尝试提高引导强度 (例如调整到 4.0)。
加速推理: 我们采用了 FLUX-turbo LoRA 技术,将采样步数从25步减少到12步(相较于FLUX-dev版本)。此外,我们还蒸馏了CFG LoRA,通过免除真正的CFG计算,实现了近两倍的步数削减,大幅提升了生成速度
'''  # noqa E501

_CITE_ = r"""
---


"""  # noqa E501


def create_demo():
    """Build the Gradio Blocks UI and wire it to ``generate_image``.

    The page has the header, a left column with the input controls, a right
    column with the outputs, and an examples table below them.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """

    with gr.Blocks() as demo:
        gr.Markdown(_HEADER_)

        with gr.Row():
            # left column: reference images, task selection and generation settings
            with gr.Column():
                with gr.Row():
                    ref_image1 = gr.Image(label="ref image 1", type="numpy", height=256)
                    ref_image2 = gr.Image(label="ref image 2", type="numpy", height=256)
                with gr.Row():
                    ref_task1 = gr.Dropdown(choices=["ip", "id", "style"], value="ip", label="task for ref image 1")
                    ref_task2 = gr.Dropdown(choices=["ip", "id", "style"], value="ip", label="task for ref image 2")
                prompt = gr.Textbox(label="Prompt", value="a person playing guitar in the street")
                width = gr.Slider(768, 1024, 1024, step=16, label="Width")
                height = gr.Slider(768, 1024, 1024, step=16, label="Height")
                num_steps = gr.Slider(8, 30, 12, step=1, label="Number of steps")
                guidance = gr.Slider(1.0, 10.0, 3.5, step=0.1, label="Guidance")
                # the seed is a free-text box; generate_image converts it to an int
                seed = gr.Textbox(label="Seed (-1 for random)", value="-1")
                # this accordion is created with visible=False; its components are
                # still passed to generate_image by the button handler below
                with gr.Accordion("Advanced Options", open=False, visible=False):
                    ref_res = gr.Slider(512, 1024, 512, step=16, label="resolution for ref image")
                    neg_prompt = gr.Textbox(label="Neg Prompt", value="")
                    neg_guidance = gr.Slider(1.0, 10.0, 3.5, step=0.1, label="Neg Guidance")
                    true_cfg = gr.Slider(1, 5, 1, step=0.1, label="true cfg")
                    cfg_start_step = gr.Slider(0, 30, 0, step=1, label="cfg start step")
                    cfg_end_step = gr.Slider(0, 30, 0, step=1, label="cfg end step")
                    first_step_guidance = gr.Slider(0, 10, 0, step=0.1, label="first step guidance")
                generate_btn = gr.Button("Generate")
                gr.Markdown(_CITE_)

            # right column: generated image, preprocessed references and the seed used
            with gr.Column():
                output_image = gr.Image(label="Generated Image", format='png')
                debug_image = gr.Gallery(
                    label="Preprocessing output (including possible face crop and background remove)",
                    elem_id="gallery",
                )
                seed_output = gr.Textbox(label="Used Seed")

        with gr.Row(), gr.Column():
            gr.Markdown("## Examples")
            # each row is [ref image 1, ref image 2, task 1, task 2, prompt, seed],
            # in the same order as the `inputs` list given to gr.Examples below
            example_inps = [
                [
                    'example_inputs/woman1.png',
                    None,
                    'ip',
                    'ip',
                    'profile shot dark photo of a 25-year-old female with smoke escaping from her mouth, the backlit smoke gives the image an ephemeral quality, natural face, natural eyebrows, natural skin texture, award winning photo, highly detailed face, atmospheric lighting, film grain, monochrome',  # noqa E501
                    9180879731249039735,
                ],
                [
                    'example_inputs/man1.png',
                    None,
                    'ip',
                    'ip',
                    'a man sitting on the cloud, playing guitar',
                    1206523688721442817,
                ],
                [
                    'example_inputs/toy1.png',
                    None,
                    'ip',
                    'ip',
                    'a purple toy holding a sign saying "DreamO", on the mountain',
                    10441727852953907380,
                ],
                [
                    'example_inputs/perfume.png',
                    None,
                    'ip',
                    'ip',
                    'a perfume under spotlight',
                    116150031980664704,
                ],
                [
                    'example_inputs/hinton.jpeg',
                    None,
                    'id',
                    'ip',
                    'portrait, Chibi',
                    5443415087540486371,
                ],
                [
                    'example_inputs/mickey.png',
                    None,
                    'style',
                    'ip',
                    'generate a same style image. A rooster wearing overalls.',
                    6245580464677124951,
                ],
                [
                    'example_inputs/mountain.png',
                    None,
                    'style',
                    'ip',
                    'generate a same style image. A pavilion by the river, and the distant mountains are endless',
                    5248066378927500767,
                ],
                [
                    'example_inputs/shirt.png',
                    'example_inputs/skirt.jpeg',
                    'ip',
                    'ip',
                    'A girl is wearing a short-sleeved shirt and a short skirt on the beach.',
                    9514069256241143615,
                ],
                [
                    'example_inputs/woman2.png',
                    'example_inputs/dress.png',
                    'id',
                    'ip',
                    'the woman wearing a dress, In the banquet hall',
                    7698454872441022867,
                ],
                [
                    'example_inputs/dog1.png',
                    'example_inputs/dog2.png',
                    'ip',
                    'ip',
                    'two dogs in the jungle',
                    6187006025405083344,
                ],
                [
                    'example_inputs/woman3.png',
                    'example_inputs/cat.png',
                    'ip',
                    'ip',
                    'A girl rides a giant cat, walking in the noisy modern city. High definition, realistic, non-cartoonish. Excellent photography work, 8k high definition.',  # noqa E501
                    11980469406460273604,
                ],
                [
                    'example_inputs/man2.jpeg',
                    'example_inputs/woman4.jpeg',
                    'ip',
                    'ip',
                    'a man is dancing with a woman in the room',
                    8303780338601106219,
                ],
            ]
            # examples supply only the first six arguments of generate_image;
            # the remaining parameters take the function's defaults
            gr.Examples(
                examples=example_inps,
                inputs=[ref_image1, ref_image2, ref_task1, ref_task2, prompt, seed],
                label='row 1-4: IP task; row 5: ID task; row 6-7: Style task. row 8-9: Try-On task; row 10-12: Multi IP',
                cache_examples='lazy',
                outputs=[output_image, debug_image, seed_output],
                fn=generate_image,
            )

        # the inputs are listed in the positional order of generate_image's parameters
        generate_btn.click(
            fn=generate_image,
            inputs=[
                ref_image1,
                ref_image2,
                ref_task1,
                ref_task2,
                prompt,
                seed,
                width,
                height,
                ref_res,
                num_steps,
                guidance,
                true_cfg,
                cfg_start_step,
                cfg_end_step,
                neg_prompt,
                neg_guidance,
                first_step_guidance,
            ],
            outputs=[output_image, debug_image, seed_output],
        )

    return demo


if __name__ == '__main__':
    # Build the UI and serve it with Gradio's default launch settings.
    # NOTE(review): the --port option parsed at the top of the file is not
    # passed to launch() -- confirm whether that is intended.
    demo = create_demo()
    demo.launch()