HBDing committed
Commit f53fb95 · 1 Parent(s): 796dc92
Files changed (7)
  1. Dockerfile +6 -0
  2. app.py +40 -5
  3. migc/migc_arch.py +220 -0
  4. migc/migc_layers.py +241 -0
  5. migc/migc_pipeline.py +928 -0
  6. migc/migc_utils.py +143 -0
  7. requirements.txt +3 -1
Dockerfile CHANGED
@@ -1,6 +1,12 @@
 
 FROM python:3.10
 
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+
+# Switch to the "user" user
+USER user
+
 WORKDIR /code
 
 COPY --link --chown=1000 . .
app.py CHANGED
@@ -4,14 +4,44 @@ from gradio_image_annotation import image_annotator
 from diffusers import StableDiffusionPipeline
 import os
 import torch
+from diffusers import EulerDiscreteScheduler
+from migc.migc_utils import seed_everything
+from migc.migc_pipeline import StableDiffusionMIGCPipeline, MIGCProcessor, AttentionStore
+
+
+from huggingface_hub import hf_hub_download
+
+# Download the checkpoint files
+migc_ckpt_path = hf_hub_download(
+    repo_id="limuloo1999/MIGC",
+    filename="MIGC_SD14.ckpt",
+    repo_type="model"  # may be omitted; "model" is the default
+)
+
+RV_path = hf_hub_download(
+    repo_id="SG161222/Realistic_Vision_V6.0_B1_noVAE",
+    filename="Realistic_Vision_V6.0_NV_B1.safetensors",
+    repo_type="model"  # may be omitted; "model" is the default
+)
+
 
 
 # Load model
-pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-).to("cuda" if torch.cuda.is_available() else "cpu")
+# pipe = StableDiffusionMIGCPipeline.from_pretrained(
+#     "rSG161222/Realistic_Vision_V6.0_B1_noVAE",
+#     torch_dtype=torch.float32
+# )
+pipe = StableDiffusionMIGCPipeline.from_single_file(
+    RV_path,
+    torch_dtype=torch.float32
+)
 pipe.safety_checker = None
+pipe.attention_store = AttentionStore()
+from migc.migc_utils import load_migc
+load_migc(pipe.unet, pipe.attention_store,
+          migc_ckpt_path, attn_processor=MIGCProcessor)
+pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
 
 example_annotation = {
     "image": os.path.join(os.path.dirname(__file__), "background.png"),
@@ -26,15 +56,20 @@ def get_boxes_json(annotations):
     height = image.shape[0]
     boxes = annotations["boxes"]
     prompt_final = [[]]
+    bboxes = [[]]
     for box in boxes:
         box["xmin"] = box["xmin"] / width
         box["xmax"] = box["xmax"] / width
         box["ymin"] = box["ymin"] / height
         box["ymax"] = box["ymax"] / height
         prompt_final[0].append(box["label"])
+        bboxes[0].append([box["xmin"], box["ymin"], box["xmax"], box["ymax"]])
     # import pdb; pdb.set_trace()
     prompt = ", ".join(prompt_final[0])
-    image = pipe(prompt).images[0]
+    prompt_final[0].insert(0, prompt)
+    negative_prompt = 'worst quality, low quality, bad anatomy, watermark, text, blurry'
+    image = pipe(prompt_final, bboxes, num_inference_steps=30, guidance_scale=7.5,
+                 MIGCsteps=15, aug_phase_with_and=False, negative_prompt=negative_prompt).images[0]
     return image
     # return annotations["boxes"]
 
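For reference, the MIGC pipeline call introduced above expects nested lists: prompt_final[0] holds the global prompt at index 0 followed by one phrase per instance, and bboxes[0] holds one normalized [xmin, ymin, xmax, ymax] box per instance. A minimal sketch of a direct call with illustrative boxes, assuming `pipe` has been built exactly as in app.py above:

# Illustrative values only; `pipe` is the StableDiffusionMIGCPipeline constructed in app.py.
prompt_final = [["a cat and a dog on the grass",   # global prompt at index 0
                 "a cat", "a dog"]]                # one phrase per instance
bboxes = [[[0.10, 0.40, 0.45, 0.90],               # normalized [xmin, ymin, xmax, ymax] for "a cat"
           [0.55, 0.35, 0.95, 0.90]]]              # box for "a dog"
negative_prompt = 'worst quality, low quality, bad anatomy, watermark, text, blurry'
image = pipe(prompt_final, bboxes, num_inference_steps=30, guidance_scale=7.5,
             MIGCsteps=15, aug_phase_with_and=False, negative_prompt=negative_prompt).images[0]
image.save("migc_demo.png")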
migc/migc_arch.py ADDED
@@ -0,0 +1,220 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+ from migc.migc_layers import CBAM, CrossAttention, LayoutAttention
6
+
7
+
8
+ class FourierEmbedder():
9
+ def __init__(self, num_freqs=64, temperature=100):
10
+ self.num_freqs = num_freqs
11
+ self.temperature = temperature
12
+ self.freq_bands = temperature ** ( torch.arange(num_freqs) / num_freqs )
13
+
14
+ @ torch.no_grad()
15
+ def __call__(self, x, cat_dim=-1):
16
+ out = []
17
+ for freq in self.freq_bands:
18
+ out.append( torch.sin( freq*x ) )
19
+ out.append( torch.cos( freq*x ) )
20
+ return torch.cat(out, cat_dim) # torch.Size([5, 30, 64])
21
+
22
+
23
+ class PositionNet(nn.Module):
24
+ def __init__(self, in_dim, out_dim, fourier_freqs=8):
25
+ super().__init__()
26
+ self.in_dim = in_dim
27
+ self.out_dim = out_dim
28
+
29
+ self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
30
+ self.position_dim = fourier_freqs * 2 * 4 # 2 is sin&cos, 4 is xyxy
31
+
32
+ # -------------------------------------------------------------- #
33
+ self.linears_position = nn.Sequential(
34
+ nn.Linear(self.position_dim, 512),
35
+ nn.SiLU(),
36
+ nn.Linear(512, 512),
37
+ nn.SiLU(),
38
+ nn.Linear(512, out_dim),
39
+ )
40
+
41
+ def forward(self, boxes):
42
+
43
+ # embedding position (it may include padding as placeholder)
44
+ xyxy_embedding = self.fourier_embedder(boxes) # B*1*4 --> B*1*C torch.Size([5, 1, 64])
45
+ xyxy_embedding = self.linears_position(xyxy_embedding) # B*1*C --> B*1*768 torch.Size([5, 1, 768])
46
+
47
+ return xyxy_embedding
48
+
49
+
50
+ class SAC(nn.Module):
51
+ def __init__(self, C, number_pro=30):
52
+ super().__init__()
53
+ self.C = C
54
+ self.number_pro = number_pro
55
+ self.conv1 = nn.Conv2d(C + 1, C, 1, 1)
56
+ self.cbam1 = CBAM(C)
57
+ self.conv2 = nn.Conv2d(C, 1, 1, 1)
58
+ self.cbam2 = CBAM(number_pro, reduction_ratio=1)
59
+
60
+ def forward(self, x, guidance_mask, sac_scale=None):
61
+ '''
62
+ :param x: (B, phase_num, HW, C)
63
+ :param guidance_mask: (B, phase_num, H, W)
64
+ :return:
65
+ '''
66
+ B, phase_num, HW, C = x.shape
67
+ _, _, H, W = guidance_mask.shape
68
+ guidance_mask = guidance_mask.view(guidance_mask.shape[0], phase_num, -1)[
69
+ ..., None] # (B, phase_num, HW, 1)
70
+
71
+ null_x = torch.zeros_like(x[:, [0], ...]).to(x.device)
72
+ null_mask = torch.zeros_like(guidance_mask[:, [0], ...]).to(guidance_mask.device)
73
+
74
+ x = torch.cat([x, null_x], dim=1)
75
+ guidance_mask = torch.cat([guidance_mask, null_mask], dim=1)
76
+ phase_num += 1
77
+
78
+
79
+ scale = torch.cat([x, guidance_mask], dim=-1) # (B, phase_num, HW, C+1)
80
+ scale = scale.view(-1, H, W, C + 1) # (B * phase_num, H, W, C+1)
81
+ scale = scale.permute(0, 3, 1, 2) # (B * phase_num, C+1, H, W)
82
+ scale = self.conv1(scale) # (B * phase_num, C, H, W)
83
+ scale = self.cbam1(scale) # (B * phase_num, C, H, W)
84
+ scale = self.conv2(scale) # (B * phase_num, 1, H, W)
85
+ scale = scale.view(B, phase_num, H, W) # (B, phase_num, H, W)
86
+
87
+ null_scale = scale[:, [-1], ...]
88
+ scale = scale[:, :-1, ...]
89
+ x = x[:, :-1, ...]
90
+
91
+ pad_num = self.number_pro - phase_num + 1
92
+
93
+ ori_phase_num = scale[:, 1:-1, ...].shape[1]
94
+ phase_scale = torch.cat([scale[:, 1:-1, ...], null_scale.repeat(1, pad_num, 1, 1)], dim=1)
95
+ shuffled_order = torch.randperm(phase_scale.shape[1])
96
+ inv_shuffled_order = torch.argsort(shuffled_order)
97
+
98
+ random_phase_scale = phase_scale[:, shuffled_order, ...]
99
+
100
+ scale = torch.cat([scale[:, [0], ...], random_phase_scale, scale[:, [-1], ...]], dim=1)
101
+ # (B, number_pro, H, W)
102
+
103
+ scale = self.cbam2(scale) # (B, number_pro, H, W)
104
+ scale = scale.view(B, self.number_pro, HW)[..., None] # (B, number_pro, HW)
105
+
106
+ random_phase_scale = scale[:, 1: -1, ...]
107
+ phase_scale = random_phase_scale[:, inv_shuffled_order[:ori_phase_num], :]
108
+ if sac_scale is not None:
109
+ instance_num = len(sac_scale)
110
+ for i in range(instance_num):
111
+ phase_scale[:, i, ...] = phase_scale[:, i, ...] * sac_scale[i]
112
+
113
+
114
+ scale = torch.cat([scale[:, [0], ...], phase_scale, scale[:, [-1], ...]], dim=1)
115
+
116
+ scale = scale.softmax(dim=1) # (B, phase_num, HW, 1)
117
+ out = (x * scale).sum(dim=1, keepdims=True) # (B, 1, HW, C)
118
+ return out, scale
119
+
120
+
121
+ class MIGC(nn.Module):
122
+ def __init__(self, C, attn_type='base', context_dim=768, heads=8):
123
+ super().__init__()
124
+ self.ea = CrossAttention(query_dim=C, context_dim=context_dim,
125
+ heads=heads, dim_head=C // heads,
126
+ dropout=0.0)
127
+ self.la = LayoutAttention(query_dim=C,
128
+ heads=heads, dim_head=C // heads,
129
+ dropout=0.0)
130
+ self.norm = nn.LayerNorm(C)
131
+ self.sac = SAC(C)
132
+ self.pos_net = PositionNet(in_dim=768, out_dim=768)
133
+
134
+ def forward(self, ca_x, guidance_mask, other_info, return_fuser_info=False):
135
+ # x: (B, instance_num+1, HW, C)
136
+ # guidance_mask: (B, instance_num, H, W)
137
+ # box: (instance_num, 4)
138
+ # image_token: (B, instance_num+1, HW, C)
139
+ full_H = other_info['height']
140
+ full_W = other_info['width']
141
+ B, _, HW, C = ca_x.shape
142
+ instance_num = guidance_mask.shape[1]
143
+ down_scale = int(math.sqrt(full_H * full_W // ca_x.shape[2]))
144
+ H = full_H // down_scale
145
+ W = full_W // down_scale
146
+ guidance_mask = F.interpolate(guidance_mask, size=(H, W), mode='bilinear') # (B, instance_num, H, W)
147
+
148
+
149
+ supplement_mask = other_info['supplement_mask'] # (B, 1, 64, 64)
150
+ supplement_mask = F.interpolate(supplement_mask, size=(H, W), mode='bilinear') # (B, 1, H, W)
151
+ image_token = other_info['image_token']
152
+ assert image_token.shape == ca_x.shape
153
+ context = other_info['context_pooler']
154
+ box = other_info['box']
155
+ box = box.view(B * instance_num, 1, -1)
156
+ box_token = self.pos_net(box)
157
+ context = torch.cat([context[1:, ...], box_token], dim=1)
158
+ ca_scale = other_info['ca_scale'] if 'ca_scale' in other_info else None
159
+ ea_scale = other_info['ea_scale'] if 'ea_scale' in other_info else None
160
+ sac_scale = other_info['sac_scale'] if 'sac_scale' in other_info else None
161
+
162
+ ea_x, ea_attn = self.ea(self.norm(image_token[:, 1:, ...].view(B * instance_num, HW, C)),
163
+ context=context, return_attn=True)
164
+ ea_x = ea_x.view(B, instance_num, HW, C)
165
+ ea_x = ea_x * guidance_mask.view(B, instance_num, HW, 1)
166
+
167
+ ca_x[:, 1:, ...] = ca_x[:, 1:, ...] * guidance_mask.view(B, instance_num, HW, 1) # (B, phase_num, HW, C)
168
+ if ca_scale is not None:
169
+ assert len(ca_scale) == instance_num
170
+ for i in range(instance_num):
171
+ ca_x[:, i+1, ...] = ca_x[:, i+1, ...] * ca_scale[i] + ea_x[:, i, ...] * ea_scale[i]
172
+ else:
173
+ ca_x[:, 1:, ...] = ca_x[:, 1:, ...] + ea_x
174
+
175
+ ori_image_token = image_token[:, 0, ...] # (B, HW, C)
176
+ fusion_template = self.la(x=ori_image_token, guidance_mask=torch.cat([guidance_mask[:, :, ...], supplement_mask], dim=1)) # (B, HW, C)
177
+ fusion_template = fusion_template.view(B, 1, HW, C) # (B, 1, HW, C)
178
+
179
+ ca_x = torch.cat([ca_x, fusion_template], dim = 1)
180
+ ca_x[:, 0, ...] = ca_x[:, 0, ...] * supplement_mask.view(B, HW, 1)
181
+ guidance_mask = torch.cat([
182
+ supplement_mask,
183
+ guidance_mask,
184
+ torch.ones(B, 1, H, W).to(guidance_mask.device)
185
+ ], dim=1)
186
+
187
+
188
+ out_MIGC, sac_scale = self.sac(ca_x, guidance_mask, sac_scale=sac_scale)
189
+ if return_fuser_info:
190
+ fuser_info = {}
191
+ fuser_info['sac_scale'] = sac_scale.view(B, instance_num + 2, H, W)
192
+ fuser_info['ea_attn'] = ea_attn.mean(dim=1).view(B, instance_num, H, W, 2)
193
+ return out_MIGC, fuser_info
194
+ else:
195
+ return out_MIGC
196
+
197
+
198
+ class NaiveFuser(nn.Module):
199
+ def __init__(self):
200
+ super().__init__()
201
+ def forward(self, ca_x, guidance_mask, other_info, return_fuser_info=False):
202
+ # ca_x: (B, instance_num+1, HW, C)
203
+ # guidance_mask: (B, instance_num, H, W)
204
+ # box: (instance_num, 4)
205
+ # image_token: (B, instance_num+1, HW, C)
206
+ full_H = other_info['height']
207
+ full_W = other_info['width']
208
+ B, _, HW, C = ca_x.shape
209
+ instance_num = guidance_mask.shape[1]
210
+ down_scale = int(math.sqrt(full_H * full_W // ca_x.shape[2]))
211
+ H = full_H // down_scale
212
+ W = full_W // down_scale
213
+ guidance_mask = F.interpolate(guidance_mask, size=(H, W), mode='bilinear') # (B, instance_num, H, W)
214
+ guidance_mask = torch.cat([torch.ones(B, 1, H, W).to(guidance_mask.device), guidance_mask * 10], dim=1) # (B, instance_num+1, H, W)
215
+ guidance_mask = guidance_mask.view(B, instance_num + 1, HW, 1)
216
+ out_MIGC = (ca_x * guidance_mask).sum(dim=1) / (guidance_mask.sum(dim=1) + 1e-6)
217
+ if return_fuser_info:
218
+ return out_MIGC, None
219
+ else:
220
+ return out_MIGC
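As a quick shape check for the code above: PositionNet Fourier-encodes each normalized xyxy box into fourier_freqs * 2 * 4 = 64 position features and lifts them to out_dim with a small MLP, which is how MIGC builds the box token concatenated to each instance's context. A minimal sketch, assuming the migc package added in this commit is importable:

import torch
from migc.migc_arch import PositionNet

boxes = torch.rand(5, 1, 4)                     # (B * instance_num, 1, 4), normalized [xmin, ymin, xmax, ymax]
pos_net = PositionNet(in_dim=768, out_dim=768)  # fourier_freqs defaults to 8, so 8 * 2 * 4 = 64 features per box
box_tokens = pos_net(boxes)                     # MLP: 64 -> 512 -> 512 -> 768
print(box_tokens.shape)                         # torch.Size([5, 1, 768])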
migc/migc_layers.py ADDED
@@ -0,0 +1,241 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import random
5
+ import math
6
+ from inspect import isfunction
7
+ from einops import rearrange, repeat
8
+ from torch import nn, einsum
9
+
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+
15
+ def default(val, d):
16
+ if exists(val):
17
+ return val
18
+ return d() if isfunction(d) else d
19
+
20
+
21
+ class CrossAttention(nn.Module):
22
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
23
+ super().__init__()
24
+ inner_dim = dim_head * heads
25
+ context_dim = default(context_dim, query_dim)
26
+
27
+ self.scale = dim_head ** -0.5
28
+ self.heads = heads
29
+
30
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
31
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
32
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
33
+
34
+ self.to_out = nn.Sequential(
35
+ nn.Linear(inner_dim, query_dim),
36
+ nn.Dropout(dropout)
37
+ )
38
+
39
+ def forward(self, x, context=None, mask=None, return_attn=False, need_softmax=True, guidance_mask=None,
40
+ forward_layout_guidance=False):
41
+ h = self.heads
42
+ b = x.shape[0]
43
+
44
+ q = self.to_q(x)
45
+ context = default(context, x)
46
+ k = self.to_k(context)
47
+ v = self.to_v(context)
48
+
49
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
50
+
51
+ sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
52
+ if forward_layout_guidance:
53
+ # sim: (B * phase_num * h, HW, 77), b = B * phase_num
54
+ # guidance_mask: (B, phase_num, 64, 64)
55
+ HW = sim.shape[1]
56
+ H = W = int(math.sqrt(HW))
57
+ guidance_mask = F.interpolate(guidance_mask, size=(H, W), mode='nearest') # (B, phase_num, H, W)
58
+ sim = sim.view(b, h, HW, 77)
59
+ guidance_mask = guidance_mask.view(b, 1, HW, 1)
60
+ guidance_mask[guidance_mask == 1] = 5.0
61
+ guidance_mask[guidance_mask == 0] = 0.1
62
+ sim[:, :, :, 1:] = sim[:, :, :, 1:] * guidance_mask
63
+ sim = sim.view(b * h, HW, 77)
64
+
65
+ if exists(mask):
66
+ mask = rearrange(mask, 'b ... -> b (...)')
67
+ max_neg_value = -torch.finfo(sim.dtype).max
68
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
69
+ sim.masked_fill_(~mask, max_neg_value)
70
+
71
+ if need_softmax:
72
+ attn = sim.softmax(dim=-1)
73
+ else:
74
+ attn = sim
75
+
76
+ out = einsum('b i j, b j d -> b i d', attn, v)
77
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
78
+ if return_attn:
79
+ attn = attn.view(b, h, attn.shape[-2], attn.shape[-1])
80
+ return self.to_out(out), attn
81
+ else:
82
+ return self.to_out(out)
83
+
84
+
85
+ class LayoutAttention(nn.Module):
86
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., use_lora=False):
87
+ super().__init__()
88
+ inner_dim = dim_head * heads
89
+ context_dim = default(context_dim, query_dim)
90
+
91
+ self.use_lora = use_lora
92
+ self.scale = dim_head ** -0.5
93
+ self.heads = heads
94
+
95
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
96
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
97
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
98
+
99
+ self.to_out = nn.Sequential(
100
+ nn.Linear(inner_dim, query_dim),
101
+ nn.Dropout(dropout)
102
+ )
103
+
104
+ def forward(self, x, context=None, mask=None, return_attn=False, need_softmax=True, guidance_mask=None):
105
+ h = self.heads
106
+ b = x.shape[0]
107
+
108
+ q = self.to_q(x)
109
+ context = default(context, x)
110
+ k = self.to_k(context)
111
+ v = self.to_v(context)
112
+
113
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
114
+
115
+ sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
116
+
117
+ _, phase_num, H, W = guidance_mask.shape
118
+ HW = H * W
119
+ guidance_mask_o = guidance_mask.view(b * phase_num, HW, 1)
120
+ guidance_mask_t = guidance_mask.view(b * phase_num, 1, HW)
121
+ guidance_mask_sim = torch.bmm(guidance_mask_o, guidance_mask_t) # (B * phase_num, HW, HW)
122
+ guidance_mask_sim = guidance_mask_sim.view(b, phase_num, HW, HW).sum(dim=1)
123
+ guidance_mask_sim[guidance_mask_sim > 1] = 1 # (B, HW, HW)
124
+ guidance_mask_sim = guidance_mask_sim.view(b, 1, HW, HW)
125
+ guidance_mask_sim = guidance_mask_sim.repeat(1, self.heads, 1, 1)
126
+ guidance_mask_sim = guidance_mask_sim.view(b * self.heads, HW, HW) # (B * head, HW, HW)
127
+
128
+ sim[:, :, :HW][guidance_mask_sim == 0] = -torch.finfo(sim.dtype).max
129
+
130
+ if exists(mask):
131
+ mask = rearrange(mask, 'b ... -> b (...)')
132
+ max_neg_value = -torch.finfo(sim.dtype).max
133
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
134
+ sim.masked_fill_(~mask, max_neg_value)
135
+
136
+ # attention, what we cannot get enough of
137
+
138
+ if need_softmax:
139
+ attn = sim.softmax(dim=-1)
140
+ else:
141
+ attn = sim
142
+
143
+ out = einsum('b i j, b j d -> b i d', attn, v)
144
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
145
+ if return_attn:
146
+ attn = attn.view(b, h, attn.shape[-2], attn.shape[-1])
147
+ return self.to_out(out), attn
148
+ else:
149
+ return self.to_out(out)
150
+
151
+
152
+ class BasicConv(nn.Module):
153
+ def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=False, bias=False):
154
+ super(BasicConv, self).__init__()
155
+ self.out_channels = out_planes
156
+ self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias)
157
+ self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None
158
+ self.relu = nn.ReLU() if relu else None
159
+
160
+ def forward(self, x):
161
+ x = self.conv(x)
162
+ if self.bn is not None:
163
+ x = self.bn(x)
164
+ if self.relu is not None:
165
+ x = self.relu(x)
166
+ return x
167
+
168
+ class Flatten(nn.Module):
169
+ def forward(self, x):
170
+ return x.view(x.size(0), -1)
171
+
172
+ class ChannelGate(nn.Module):
173
+ def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']):
174
+ super(ChannelGate, self).__init__()
175
+ self.gate_channels = gate_channels
176
+ self.mlp = nn.Sequential(
177
+ Flatten(),
178
+ nn.Linear(gate_channels, gate_channels // reduction_ratio),
179
+ nn.ReLU(),
180
+ nn.Linear(gate_channels // reduction_ratio, gate_channels)
181
+ )
182
+ self.pool_types = pool_types
183
+ def forward(self, x):
184
+ channel_att_sum = None
185
+ for pool_type in self.pool_types:
186
+ if pool_type=='avg':
187
+ avg_pool = F.avg_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
188
+ channel_att_raw = self.mlp( avg_pool )
189
+ elif pool_type=='max':
190
+ max_pool = F.max_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
191
+ channel_att_raw = self.mlp( max_pool )
192
+ elif pool_type=='lp':
193
+ lp_pool = F.lp_pool2d( x, 2, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
194
+ channel_att_raw = self.mlp( lp_pool )
195
+ elif pool_type=='lse':
196
+ # LSE pool only
197
+ lse_pool = logsumexp_2d(x)
198
+ channel_att_raw = self.mlp( lse_pool )
199
+
200
+ if channel_att_sum is None:
201
+ channel_att_sum = channel_att_raw
202
+ else:
203
+ channel_att_sum = channel_att_sum + channel_att_raw
204
+
205
+ scale = F.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x)
206
+ return x * scale
207
+
208
+ def logsumexp_2d(tensor):
209
+ tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1)
210
+ s, _ = torch.max(tensor_flatten, dim=2, keepdim=True)
211
+ outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log()
212
+ return outputs
213
+
214
+ class ChannelPool(nn.Module):
215
+ def forward(self, x):
216
+ return torch.cat( (torch.max(x,1)[0].unsqueeze(1), torch.mean(x,1).unsqueeze(1)), dim=1 )
217
+
218
+ class SpatialGate(nn.Module):
219
+ def __init__(self):
220
+ super(SpatialGate, self).__init__()
221
+ kernel_size = 7
222
+ self.compress = ChannelPool()
223
+ self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False)
224
+ def forward(self, x):
225
+ x_compress = self.compress(x)
226
+ x_out = self.spatial(x_compress)
227
+ scale = F.sigmoid(x_out) # broadcasting
228
+ return x * scale
229
+
230
+ class CBAM(nn.Module):
231
+ def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'], no_spatial=False):
232
+ super(CBAM, self).__init__()
233
+ self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types)
234
+ self.no_spatial=no_spatial
235
+ if not no_spatial:
236
+ self.SpatialGate = SpatialGate()
237
+ def forward(self, x):
238
+ x_out = self.ChannelGate(x)
239
+ if not self.no_spatial:
240
+ x_out = self.SpatialGate(x_out)
241
+ return x_out
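The CBAM block defined above is the attention module SAC in migc_arch.py uses when aggregating per-instance outputs; it runs a channel gate followed by a 7x7 spatial gate and preserves the input shape. A minimal sketch, assuming the migc package added in this commit is importable:

import torch
from migc.migc_layers import CBAM

x = torch.randn(2, 320, 32, 32)                     # (B, C, H, W) feature map
cbam = CBAM(gate_channels=320, reduction_ratio=16)
y = cbam(x)                                         # channel attention, then spatial attention
print(y.shape)                                      # torch.Size([2, 320, 32, 32])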
migc/migc_pipeline.py ADDED
@@ -0,0 +1,928 @@
1
+ import glob
2
+ import random
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union
5
+ # import moxing as mox
6
+ import numpy as np
7
+ import torch
8
+ from diffusers.loaders import TextualInversionLoaderMixin
9
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
10
+ from diffusers.models.attention_processor import Attention
11
+ from diffusers.pipelines.stable_diffusion import (
12
+ StableDiffusionPipeline,
13
+ StableDiffusionPipelineOutput,
14
+ StableDiffusionSafetyChecker,
15
+ )
16
+ from diffusers.schedulers import KarrasDiffusionSchedulers
17
+ from diffusers.utils import logging
18
+ from PIL import Image, ImageDraw, ImageFont
19
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
20
+ import inspect
21
+ import os
22
+ import math
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+ # from utils import load_utils
26
+ import argparse
27
+ import yaml
28
+ import cv2
29
+ import math
30
+ from migc.migc_arch import MIGC, NaiveFuser
31
+ from scipy.ndimage import uniform_filter, gaussian_filter
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+ class AttentionStore:
36
+ @staticmethod
37
+ def get_empty_store():
38
+ return {"down": [], "mid": [], "up": []}
39
+
40
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
41
+ if is_cross:
42
+ if attn.shape[1] in self.attn_res:
43
+ self.step_store[place_in_unet].append(attn)
44
+
45
+ self.cur_att_layer += 1
46
+ if self.cur_att_layer == self.num_att_layers:
47
+ self.cur_att_layer = 0
48
+ self.between_steps()
49
+
50
+ def between_steps(self):
51
+ self.attention_store = self.step_store
52
+ self.step_store = self.get_empty_store()
53
+
54
+ def maps(self, block_type: str):
55
+ return self.attention_store[block_type]
56
+
57
+ def reset(self):
58
+ self.cur_att_layer = 0
59
+ self.step_store = self.get_empty_store()
60
+ self.attention_store = {}
61
+
62
+ def __init__(self, attn_res=[64*64, 32*32, 16*16, 8*8]):
63
+ """
64
+ Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion
65
+ process
66
+ """
67
+ self.num_att_layers = -1
68
+ self.cur_att_layer = 0
69
+ self.step_store = self.get_empty_store()
70
+ self.attention_store = {}
71
+ self.curr_step_index = 0
72
+ self.attn_res = attn_res
73
+
74
+
75
+ def get_sup_mask(mask_list):
76
+ or_mask = np.zeros_like(mask_list[0])
77
+ for mask in mask_list:
78
+ or_mask += mask
79
+ or_mask[or_mask >= 1] = 1
80
+ sup_mask = 1 - or_mask
81
+ return sup_mask
82
+
83
+
84
+ class MIGCProcessor(nn.Module):
85
+ def __init__(self, config, attnstore, place_in_unet):
86
+ super().__init__()
87
+ self.attnstore = attnstore
88
+ self.place_in_unet = place_in_unet
89
+ self.not_use_migc = config['not_use_migc']
90
+ self.naive_fuser = NaiveFuser()
91
+ self.embedding = {}
92
+ if not self.not_use_migc:
93
+ self.migc = MIGC(config['C'])
94
+
95
+ def __call__(
96
+ self,
97
+ attn: Attention,
98
+ hidden_states,
99
+ encoder_hidden_states=None,
100
+ attention_mask=None,
101
+ prompt_nums=[],
102
+ bboxes=[],
103
+ ith=None,
104
+ embeds_pooler=None,
105
+ timestep=None,
106
+ height=512,
107
+ width=512,
108
+ MIGCsteps=20,
109
+ NaiveFuserSteps=-1,
110
+ ca_scale=None,
111
+ ea_scale=None,
112
+ sac_scale=None,
113
+ use_sa_preserve=False,
114
+ sa_preserve=False,
115
+ ):
116
+ batch_size, sequence_length, _ = hidden_states.shape
117
+ assert batch_size == 2, "We currently only implement sampling with batch_size=1, \
118
+ and we will implement sampling with batch_size=N as soon as possible."
119
+ attention_mask = attn.prepare_attention_mask(
120
+ attention_mask, sequence_length, batch_size
121
+ )
122
+
123
+ instance_num = len(bboxes[0])
124
+
125
+ if ith > MIGCsteps:
126
+ not_use_migc = True
127
+ else:
128
+ not_use_migc = self.not_use_migc
129
+ is_vanilla_cross = (not_use_migc and ith > NaiveFuserSteps)
130
+ if instance_num == 0:
131
+ is_vanilla_cross = True
132
+
133
+ is_cross = encoder_hidden_states is not None
134
+
135
+ ori_hidden_states = hidden_states.clone()
136
+
137
+ # Only Need Negative Prompt and Global Prompt.
138
+ if is_cross and is_vanilla_cross:
139
+ encoder_hidden_states = encoder_hidden_states[:2, ...]
140
+
141
+ # In this case, we need to use MIGC or naive_fuser, so we copy the hidden_states_cond (instance_num+1) times for QKV
142
+ if is_cross and not is_vanilla_cross:
143
+ hidden_states_uncond = hidden_states[[0], ...]
144
+ hidden_states_cond = hidden_states[[1], ...].repeat(instance_num + 1, 1, 1)
145
+ hidden_states = torch.cat([hidden_states_uncond, hidden_states_cond])
146
+
147
+ # QKV Operation of Vanilla Self-Attention or Cross-Attention
148
+ query = attn.to_q(hidden_states)
149
+
150
+ if (
151
+ not is_cross
152
+ and use_sa_preserve
153
+ and timestep.item() in self.embedding
154
+ and self.place_in_unet == "up"
155
+ ):
156
+ hidden_states = torch.cat((hidden_states, torch.from_numpy(self.embedding[timestep.item()]).to(hidden_states.device)), dim=1)
157
+
158
+ if not is_cross and sa_preserve and self.place_in_unet == "up":
159
+ self.embedding[timestep.item()] = ori_hidden_states.cpu().numpy()
160
+
161
+ encoder_hidden_states = (
162
+ encoder_hidden_states
163
+ if encoder_hidden_states is not None
164
+ else hidden_states
165
+ )
166
+ key = attn.to_k(encoder_hidden_states)
167
+ value = attn.to_v(encoder_hidden_states)
168
+ query = attn.head_to_batch_dim(query)
169
+ key = attn.head_to_batch_dim(key)
170
+ value = attn.head_to_batch_dim(value)
171
+ attention_probs = attn.get_attention_scores(query, key, attention_mask) # 48 4096 77
172
+ self.attnstore(attention_probs, is_cross, self.place_in_unet)
173
+ hidden_states = torch.bmm(attention_probs, value)
174
+ hidden_states = attn.batch_to_head_dim(hidden_states)
175
+ hidden_states = attn.to_out[0](hidden_states)
176
+ hidden_states = attn.to_out[1](hidden_states)
177
+
178
+ ###### Self-Attention Results ######
179
+ if not is_cross:
180
+ return hidden_states
181
+
182
+ ###### Vanilla Cross-Attention Results ######
183
+ if is_vanilla_cross:
184
+ return hidden_states
185
+
186
+ ###### Cross-Attention with MIGC ######
187
+ assert (not is_vanilla_cross)
188
+ # hidden_states: torch.Size([1+1+instance_num, HW, C]), the first 1 is the uncond ca output, the second 1 is the global ca output.
189
+ hidden_states_uncond = hidden_states[[0], ...] # torch.Size([1, HW, C])
190
+ cond_ca_output = hidden_states[1: , ...].unsqueeze(0) # torch.Size([1, 1+instance_num, 5, 64, 1280])
191
+ guidance_masks = []
192
+ in_box = []
193
+ # Construct Instance Guidance Mask
194
+ for bbox in bboxes[0]:
195
+ guidance_mask = np.zeros((height, width))
196
+ w_min = int(width * bbox[0])
197
+ w_max = int(width * bbox[2])
198
+ h_min = int(height * bbox[1])
199
+ h_max = int(height * bbox[3])
200
+ guidance_mask[h_min: h_max, w_min: w_max] = 1.0
201
+ guidance_masks.append(guidance_mask[None, ...])
202
+ in_box.append([bbox[0], bbox[2], bbox[1], bbox[3]])
203
+
204
+ # Construct Background Guidance Mask
205
+ sup_mask = get_sup_mask(guidance_masks)
206
+ supplement_mask = torch.from_numpy(sup_mask[None, ...])
207
+ supplement_mask = F.interpolate(supplement_mask, (height//8, width//8), mode='bilinear').float()
208
+ supplement_mask = supplement_mask.to(hidden_states.device) # (1, 1, H, W)
209
+
210
+ guidance_masks = np.concatenate(guidance_masks, axis=0)
211
+ guidance_masks = guidance_masks[None, ...]
212
+ guidance_masks = torch.from_numpy(guidance_masks).float().to(cond_ca_output.device)
213
+ guidance_masks = F.interpolate(guidance_masks, (height//8, width//8), mode='bilinear') # (1, instance_num, H, W)
214
+
215
+ in_box = torch.from_numpy(np.array(in_box))[None, ...].float().to(cond_ca_output.device) # (1, instance_num, 4)
216
+
217
+ other_info = {}
218
+ other_info['image_token'] = hidden_states_cond[None, ...]
219
+ other_info['context'] = encoder_hidden_states[1:, ...]
220
+ other_info['box'] = in_box
221
+ other_info['context_pooler'] = embeds_pooler # (instance_num, 1, 768)
222
+ other_info['supplement_mask'] = supplement_mask
223
+ other_info['attn2'] = None
224
+ other_info['attn'] = attn
225
+ other_info['height'] = height
226
+ other_info['width'] = width
227
+ other_info['ca_scale'] = ca_scale
228
+ other_info['ea_scale'] = ea_scale
229
+ other_info['sac_scale'] = sac_scale
230
+
231
+ if not not_use_migc:
232
+ hidden_states_cond, fuser_info = self.migc(cond_ca_output,
233
+ guidance_masks,
234
+ other_info=other_info,
235
+ return_fuser_info=True)
236
+ else:
237
+ hidden_states_cond, fuser_info = self.naive_fuser(cond_ca_output,
238
+ guidance_masks,
239
+ other_info=other_info,
240
+ return_fuser_info=True)
241
+ hidden_states_cond = hidden_states_cond.squeeze(1)
242
+
243
+ hidden_states = torch.cat([hidden_states_uncond, hidden_states_cond])
244
+ return hidden_states
245
+
246
+
247
+ class StableDiffusionMIGCPipeline(StableDiffusionPipeline):
248
+ def __init__(
249
+ self,
250
+ vae: AutoencoderKL,
251
+ text_encoder: CLIPTextModel,
252
+ tokenizer: CLIPTokenizer,
253
+ unet: UNet2DConditionModel,
254
+ scheduler: KarrasDiffusionSchedulers,
255
+ safety_checker: StableDiffusionSafetyChecker,
256
+ feature_extractor: CLIPImageProcessor,
257
+ image_encoder: CLIPVisionModelWithProjection = None,
258
+ requires_safety_checker: bool = True,
259
+ ):
260
+ # Get the parameter signature of the parent class constructor
261
+ parent_init_signature = inspect.signature(super().__init__)
262
+ parent_init_params = parent_init_signature.parameters
263
+
264
+ # Dynamically build a parameter dictionary based on the parameters of the parent class constructor
265
+ init_kwargs = {
266
+ "vae": vae,
267
+ "text_encoder": text_encoder,
268
+ "tokenizer": tokenizer,
269
+ "unet": unet,
270
+ "scheduler": scheduler,
271
+ "safety_checker": safety_checker,
272
+ "feature_extractor": feature_extractor,
273
+ "requires_safety_checker": requires_safety_checker
274
+ }
275
+ if 'image_encoder' in parent_init_params.items():
276
+ init_kwargs['image_encoder'] = image_encoder
277
+ super().__init__(**init_kwargs)
278
+
279
+ self.instance_set = set()
280
+ self.embedding = {}
281
+
282
+ def _encode_prompt(
283
+ self,
284
+ prompts,
285
+ device,
286
+ num_images_per_prompt,
287
+ do_classifier_free_guidance,
288
+ negative_prompt=None,
289
+ prompt_embeds: Optional[torch.FloatTensor] = None,
290
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
291
+ ):
292
+ r"""
293
+ Encodes the prompt into text encoder hidden states.
294
+
295
+ Args:
296
+ prompt (`str` or `List[str]`, *optional*):
297
+ prompt to be encoded
298
+ device: (`torch.device`):
299
+ torch device
300
+ num_images_per_prompt (`int`):
301
+ number of images that should be generated per prompt
302
+ do_classifier_free_guidance (`bool`):
303
+ whether to use classifier free guidance or not
304
+ negative_prompt (`str` or `List[str]`, *optional*):
305
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
306
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
307
+ less than `1`).
308
+ prompt_embeds (`torch.FloatTensor`, *optional*):
309
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
310
+ provided, text embeddings will be generated from `prompt` input argument.
311
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
312
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
313
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
314
+ argument.
315
+ """
316
+ if prompts is not None and isinstance(prompts, str):
317
+ batch_size = 1
318
+ elif prompts is not None and isinstance(prompts, list):
319
+ batch_size = len(prompts)
320
+ else:
321
+ batch_size = prompt_embeds.shape[0]
322
+
323
+ prompt_embeds_none_flag = (prompt_embeds is None)
324
+ prompt_embeds_list = []
325
+ embeds_pooler_list = []
326
+ for prompt in prompts:
327
+ if prompt_embeds_none_flag:
328
+ # textual inversion: process multi-vector tokens if necessary
329
+ if isinstance(self, TextualInversionLoaderMixin):
330
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
331
+
332
+ text_inputs = self.tokenizer(
333
+ prompt,
334
+ padding="max_length",
335
+ max_length=self.tokenizer.model_max_length,
336
+ truncation=True,
337
+ return_tensors="pt",
338
+ )
339
+ text_input_ids = text_inputs.input_ids
340
+ untruncated_ids = self.tokenizer(
341
+ prompt, padding="longest", return_tensors="pt"
342
+ ).input_ids
343
+
344
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
345
+ -1
346
+ ] and not torch.equal(text_input_ids, untruncated_ids):
347
+ removed_text = self.tokenizer.batch_decode(
348
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
349
+ )
350
+ logger.warning(
351
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
352
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
353
+ )
354
+
355
+ if (
356
+ hasattr(self.text_encoder.config, "use_attention_mask")
357
+ and self.text_encoder.config.use_attention_mask
358
+ ):
359
+ attention_mask = text_inputs.attention_mask.to(device)
360
+ else:
361
+ attention_mask = None
362
+
363
+ prompt_embeds = self.text_encoder(
364
+ text_input_ids.to(device),
365
+ attention_mask=attention_mask,
366
+ )
367
+ embeds_pooler = prompt_embeds.pooler_output
368
+ prompt_embeds = prompt_embeds[0]
369
+
370
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
371
+ embeds_pooler = embeds_pooler.to(dtype=self.text_encoder.dtype, device=device)
372
+
373
+ bs_embed, seq_len, _ = prompt_embeds.shape
374
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
375
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
376
+ embeds_pooler = embeds_pooler.repeat(1, num_images_per_prompt)
377
+ prompt_embeds = prompt_embeds.view(
378
+ bs_embed * num_images_per_prompt, seq_len, -1
379
+ )
380
+ embeds_pooler = embeds_pooler.view(
381
+ bs_embed * num_images_per_prompt, -1
382
+ )
383
+ prompt_embeds_list.append(prompt_embeds)
384
+ embeds_pooler_list.append(embeds_pooler)
385
+ prompt_embeds = torch.cat(prompt_embeds_list, dim=0)
386
+ embeds_pooler = torch.cat(embeds_pooler_list, dim=0)
387
+ # negative_prompt_embeds: (prompt_nums[0]+prompt_nums[1]+...prompt_nums[n], token_num, token_channel), <class 'torch.Tensor'>
388
+
389
+ # get unconditional embeddings for classifier free guidance
390
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
391
+ uncond_tokens: List[str]
392
+ if negative_prompt is None:
393
+ negative_prompt = "worst quality, low quality, bad anatomy"
394
+ uncond_tokens = [negative_prompt] * batch_size
395
+
396
+ # textual inversion: process multi-vector tokens if necessary
397
+ if isinstance(self, TextualInversionLoaderMixin):
398
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
399
+
400
+ max_length = prompt_embeds.shape[1]
401
+ uncond_input = self.tokenizer(
402
+ uncond_tokens,
403
+ padding="max_length",
404
+ max_length=max_length,
405
+ truncation=True,
406
+ return_tensors="pt",
407
+ )
408
+
409
+ if (
410
+ hasattr(self.text_encoder.config, "use_attention_mask")
411
+ and self.text_encoder.config.use_attention_mask
412
+ ):
413
+ attention_mask = uncond_input.attention_mask.to(device)
414
+ else:
415
+ attention_mask = None
416
+
417
+ negative_prompt_embeds = self.text_encoder(
418
+ uncond_input.input_ids.to(device),
419
+ attention_mask=attention_mask,
420
+ )
421
+ negative_prompt_embeds = negative_prompt_embeds[0]
422
+
423
+ if do_classifier_free_guidance:
424
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
425
+ seq_len = negative_prompt_embeds.shape[1]
426
+
427
+ negative_prompt_embeds = negative_prompt_embeds.to(
428
+ dtype=self.text_encoder.dtype, device=device
429
+ )
430
+
431
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
432
+ 1, num_images_per_prompt, 1
433
+ )
434
+ negative_prompt_embeds = negative_prompt_embeds.view(
435
+ batch_size * num_images_per_prompt, seq_len, -1
436
+ )
437
+ # negative_prompt_embeds: (len(prompt_nums), token_num, token_channel), <class 'torch.Tensor'>
438
+
439
+ # For classifier free guidance, we need to do two forward passes.
440
+ # Here we concatenate the unconditional and text embeddings into a single batch
441
+ # to avoid doing two forward passes
442
+ final_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
443
+
444
+ return final_prompt_embeds, prompt_embeds, embeds_pooler[:, None, :]
445
+
446
+ def check_inputs(
447
+ self,
448
+ prompt,
449
+ token_indices,
450
+ bboxes,
451
+ height,
452
+ width,
453
+ callback_steps,
454
+ negative_prompt=None,
455
+ prompt_embeds=None,
456
+ negative_prompt_embeds=None,
457
+ ):
458
+ if height % 8 != 0 or width % 8 != 0:
459
+ raise ValueError(
460
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
461
+ )
462
+
463
+ if (callback_steps is None) or (
464
+ callback_steps is not None
465
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
466
+ ):
467
+ raise ValueError(
468
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
469
+ f" {type(callback_steps)}."
470
+ )
471
+
472
+ if prompt is not None and prompt_embeds is not None:
473
+ raise ValueError(
474
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
475
+ " only forward one of the two."
476
+ )
477
+ elif prompt is None and prompt_embeds is None:
478
+ raise ValueError(
479
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
480
+ )
481
+ elif prompt is not None and (
482
+ not isinstance(prompt, str) and not isinstance(prompt, list)
483
+ ):
484
+ raise ValueError(
485
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
486
+ )
487
+
488
+ if negative_prompt is not None and negative_prompt_embeds is not None:
489
+ raise ValueError(
490
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
491
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
492
+ )
493
+
494
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
495
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
496
+ raise ValueError(
497
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
498
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
499
+ f" {negative_prompt_embeds.shape}."
500
+ )
501
+
502
+ if token_indices is not None:
503
+ if isinstance(token_indices, list):
504
+ if isinstance(token_indices[0], list):
505
+ if isinstance(token_indices[0][0], list):
506
+ token_indices_batch_size = len(token_indices)
507
+ elif isinstance(token_indices[0][0], int):
508
+ token_indices_batch_size = 1
509
+ else:
510
+ raise TypeError(
511
+ "`token_indices` must be a list of lists of integers or a list of integers."
512
+ )
513
+ else:
514
+ raise TypeError(
515
+ "`token_indices` must be a list of lists of integers or a list of integers."
516
+ )
517
+ else:
518
+ raise TypeError(
519
+ "`token_indices` must be a list of lists of integers or a list of integers."
520
+ )
521
+
522
+ if bboxes is not None:
523
+ if isinstance(bboxes, list):
524
+ if isinstance(bboxes[0], list):
525
+ if (
526
+ isinstance(bboxes[0][0], list)
527
+ and len(bboxes[0][0]) == 4
528
+ and all(isinstance(x, float) for x in bboxes[0][0])
529
+ ):
530
+ bboxes_batch_size = len(bboxes)
531
+ elif (
532
+ isinstance(bboxes[0], list)
533
+ and len(bboxes[0]) == 4
534
+ and all(isinstance(x, float) for x in bboxes[0])
535
+ ):
536
+ bboxes_batch_size = 1
537
+ else:
538
+ print(isinstance(bboxes[0], list), len(bboxes[0]))
539
+ raise TypeError(
540
+ "`bboxes` must be a list of lists of list with four floats or a list of tuples with four floats."
541
+ )
542
+ else:
543
+ print(isinstance(bboxes[0], list), len(bboxes[0]))
544
+ raise TypeError(
545
+ "`bboxes` must be a list of lists of list with four floats or a list of tuples with four floats."
546
+ )
547
+ else:
548
+ print(isinstance(bboxes[0], list), len(bboxes[0]))
549
+ raise TypeError(
550
+ "`bboxes` must be a list of lists of list with four floats or a list of tuples with four floats."
551
+ )
552
+
553
+ if prompt is not None and isinstance(prompt, str):
554
+ prompt_batch_size = 1
555
+ elif prompt is not None and isinstance(prompt, list):
556
+ prompt_batch_size = len(prompt)
557
+ elif prompt_embeds is not None:
558
+ prompt_batch_size = prompt_embeds.shape[0]
559
+
560
+ if token_indices_batch_size != prompt_batch_size:
561
+ raise ValueError(
562
+ f"token indices batch size must be same as prompt batch size. token indices batch size: {token_indices_batch_size}, prompt batch size: {prompt_batch_size}"
563
+ )
564
+
565
+ if bboxes_batch_size != prompt_batch_size:
566
+ raise ValueError(
567
+ f"bbox batch size must be same as prompt batch size. bbox batch size: {bboxes_batch_size}, prompt batch size: {prompt_batch_size}"
568
+ )
569
+
570
+ def get_indices(self, prompt: str) -> Dict[str, int]:
571
+ """Utility function to list the indices of the tokens you wish to alte"""
572
+ ids = self.tokenizer(prompt).input_ids
573
+ indices = {
574
+ i: tok
575
+ for tok, i in zip(
576
+ self.tokenizer.convert_ids_to_tokens(ids), range(len(ids))
577
+ )
578
+ }
579
+ return indices
580
+
581
+ @staticmethod
582
+ def draw_box(pil_img: Image, bboxes: List[List[float]]) -> Image:
583
+ """Utility function to draw bbox on the image"""
584
+ width, height = pil_img.size
585
+ draw = ImageDraw.Draw(pil_img)
586
+
587
+ for obj_box in bboxes:
588
+ x_min, y_min, x_max, y_max = (
589
+ obj_box[0] * width,
590
+ obj_box[1] * height,
591
+ obj_box[2] * width,
592
+ obj_box[3] * height,
593
+ )
594
+ draw.rectangle(
595
+ [int(x_min), int(y_min), int(x_max), int(y_max)],
596
+ outline="red",
597
+ width=4,
598
+ )
599
+
600
+ return pil_img
601
+
602
+
603
+ @staticmethod
604
+ def draw_box_desc(pil_img: Image, bboxes: List[List[float]], prompt: List[str]) -> Image:
605
+ """Utility function to draw bbox on the image"""
606
+ color_list = ['red', 'blue', 'yellow', 'purple', 'green', 'black', 'brown', 'orange', 'white', 'gray']
607
+ width, height = pil_img.size
608
+ draw = ImageDraw.Draw(pil_img)
609
+ font_folder = os.path.dirname(os.path.dirname(__file__))
610
+ font_path = os.path.join(font_folder, 'Rainbow-Party-2.ttf')
611
+ font = ImageFont.truetype(font_path, 30)
612
+
613
+ for box_id in range(len(bboxes)):
614
+ obj_box = bboxes[box_id]
615
+ text = prompt[box_id]
616
+ fill = 'black'
617
+ for color in prompt[box_id].split(' '):
618
+ if color in color_list:
619
+ fill = color
620
+ text = text.split(',')[0]
621
+ x_min, y_min, x_max, y_max = (
622
+ obj_box[0] * width,
623
+ obj_box[1] * height,
624
+ obj_box[2] * width,
625
+ obj_box[3] * height,
626
+ )
627
+ draw.rectangle(
628
+ [int(x_min), int(y_min), int(x_max), int(y_max)],
629
+ outline=fill,
630
+ width=4,
631
+ )
632
+ draw.text((int(x_min), int(y_min)), text, fill=fill, font=font)
633
+
634
+ return pil_img
635
+
636
+
637
+ @torch.no_grad()
638
+ def __call__(
639
+ self,
640
+ prompt: List[List[str]] = None,
641
+ bboxes: List[List[List[float]]] = None,
642
+ height: Optional[int] = None,
643
+ width: Optional[int] = None,
644
+ num_inference_steps: int = 50,
645
+ guidance_scale: float = 7.5,
646
+ negative_prompt: Optional[Union[str, List[str]]] = None,
647
+ num_images_per_prompt: Optional[int] = 1,
648
+ eta: float = 0.0,
649
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
650
+ latents: Optional[torch.FloatTensor] = None,
651
+ prompt_embeds: Optional[torch.FloatTensor] = None,
652
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
653
+ output_type: Optional[str] = "pil",
654
+ return_dict: bool = True,
655
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
656
+ callback_steps: int = 1,
657
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
658
+ MIGCsteps=20,
659
+ NaiveFuserSteps=-1,
660
+ ca_scale=None,
661
+ ea_scale=None,
662
+ sac_scale=None,
663
+ aug_phase_with_and=False,
664
+ sa_preserve=False,
665
+ use_sa_preserve=False,
666
+ clear_set=False,
667
+ GUI_progress=None
668
+ ):
669
+ r"""
670
+ Function invoked when calling the pipeline for generation.
671
+
672
+ Args:
673
+ prompt (`str` or `List[str]`, *optional*):
674
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
675
+ instead.
676
+ token_indices (Union[List[List[List[int]]], List[List[int]]], optional):
677
+ The list of the indexes in the prompt to layout. Defaults to None.
678
+ bboxes (Union[List[List[List[float]]], List[List[float]]], optional):
679
+ The bounding boxes of the indexes to maintain layout in the image. Defaults to None.
680
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
681
+ The height in pixels of the generated image.
682
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
683
+ The width in pixels of the generated image.
684
+ num_inference_steps (`int`, *optional*, defaults to 50):
685
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
686
+ expense of slower inference.
687
+ guidance_scale (`float`, *optional*, defaults to 7.5):
688
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
689
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
690
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
691
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
692
+ usually at the expense of lower image quality.
693
+ negative_prompt (`str` or `List[str]`, *optional*):
694
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
695
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
696
+ less than `1`).
697
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
698
+ The number of images to generate per prompt.
699
+ eta (`float`, *optional*, defaults to 0.0):
700
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
701
+ [`schedulers.DDIMScheduler`], will be ignored for others.
702
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
703
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
704
+ to make generation deterministic.
705
+ latents (`torch.FloatTensor`, *optional*):
706
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
707
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
708
+ tensor will be generated by sampling using the supplied random `generator`.
709
+ prompt_embeds (`torch.FloatTensor`, *optional*):
710
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
711
+ provided, text embeddings will be generated from `prompt` input argument.
712
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
713
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
714
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
715
+ argument.
716
+ output_type (`str`, *optional*, defaults to `"pil"`):
717
+ The output format of the generate image. Choose between
718
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
719
+ return_dict (`bool`, *optional*, defaults to `True`):
720
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
721
+ plain tuple.
722
+ callback (`Callable`, *optional*):
723
+ A function that will be called every `callback_steps` steps during inference. The function will be
724
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
725
+ callback_steps (`int`, *optional*, defaults to 1):
726
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
727
+ called at every step.
728
+ cross_attention_kwargs (`dict`, *optional*):
729
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
730
+ `self.processor` in
731
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
732
+ max_guidance_iter (`int`, *optional*, defaults to `10`):
733
+ The maximum number of iterations for the layout guidance on attention maps in diffusion mode.
734
+ max_guidance_iter_per_step (`int`, *optional*, defaults to `5`):
735
+ The maximum number of iterations to run during each time step for layout guidance.
736
+ scale_factor (`int`, *optional*, defaults to `50`):
737
+ The scale factor used to update the latents during optimization.
738
+
739
+ Examples:
740
+
741
+ Returns:
742
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
743
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
744
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
745
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
746
+ (nsfw) content, according to the `safety_checker`.
747
+ """
748
+ def aug_phase_with_and_function(phase, instance_num):
749
+ instance_num = min(instance_num, 7)
750
+ copy_phase = [phase] * instance_num
751
+ phase = ', and '.join(copy_phase)
752
+ return phase
753
+
754
+ if aug_phase_with_and:
755
+ instance_num = len(prompt[0]) - 1
756
+ for i in range(1, len(prompt[0])):
757
+ prompt[0][i] = aug_phase_with_and_function(prompt[0][i],
758
+ instance_num)
759
+ # 0. Default height and width to unet
760
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
761
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
762
+
763
+ # 2. Define call parameters
764
+ if prompt is not None and isinstance(prompt, str):
765
+ batch_size = 1
766
+ elif prompt is not None and isinstance(prompt, list):
767
+ batch_size = len(prompt)
768
+ else:
769
+ batch_size = prompt_embeds.shape[0]
770
+
771
+ prompt_nums = [0] * len(prompt)
772
+ for i, _ in enumerate(prompt):
773
+ prompt_nums[i] = len(_)
774
+
775
+ device = self._execution_device
776
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
777
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
778
+ # corresponds to doing no classifier free guidance.
779
+ do_classifier_free_guidance = guidance_scale > 1.0
780
+
781
+ # 3. Encode input prompt
782
+ prompt_embeds, cond_prompt_embeds, embeds_pooler = self._encode_prompt(
783
+ prompt,
784
+ device,
785
+ num_images_per_prompt,
786
+ do_classifier_free_guidance,
787
+ negative_prompt,
788
+ prompt_embeds=prompt_embeds,
789
+ negative_prompt_embeds=negative_prompt_embeds,
790
+ )
791
+ # print(prompt_embeds.shape) 3 77 768
792
+
793
+ # 4. Prepare timesteps
794
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
795
+ timesteps = self.scheduler.timesteps
796
+
797
+ # 5. Prepare latent variables
798
+ num_channels_latents = self.unet.config.in_channels
799
+ latents = self.prepare_latents(
800
+ batch_size * num_images_per_prompt,
801
+ num_channels_latents,
802
+ height,
803
+ width,
804
+ prompt_embeds.dtype,
805
+ device,
806
+ generator,
807
+ latents,
808
+ )
809
+
810
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
811
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
812
+
813
+ # 7. Denoising loop
814
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
815
+
816
+ if clear_set:
817
+ self.instance_set = set()
818
+ self.embedding = {}
819
+
820
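+ # Track which (bbox, phrase) instances changed since the previous call: the symmetric
+ # difference with the stored instance set marks the regions that must be re-generated.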
+ now_set = set()
821
+ for i in range(len(bboxes[0])):
822
+ now_set.add((tuple(bboxes[0][i]), prompt[0][i + 1]))
823
+
824
+ mask_set = (now_set | self.instance_set) - (now_set & self.instance_set)
825
+ self.instance_set = now_set
826
+
827
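+ # Build a preservation mask at latent resolution (1/8 of the image): 1 where earlier
+ # content may be kept, 0 inside the (slightly expanded) boxes of changed instances.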
+ guidance_mask = np.full((4, height // 8, width // 8), 1.0)
828
+
829
+ for bbox, _ in mask_set:
830
+ w_min = max(0, int(width * bbox[0] // 8) - 5)
831
+ w_max = min(width, int(width * bbox[2] // 8) + 5)
832
+ h_min = max(0, int(height * bbox[1] // 8) - 5)
833
+ h_max = min(height, int(height * bbox[3] // 8) + 5)
834
+ guidance_mask[:, h_min:h_max, w_min:w_max] = 0
835
+
836
+ # Soften the hard box edges with a uniform (box) filter so preserved and
+ # regenerated regions blend smoothly in latent space.
+ kernel_size = 5
837
+ guidance_mask = uniform_filter(
838
+ guidance_mask, axes=(1, 2), size=kernel_size
839
+ )
840
+
841
+ guidance_mask = torch.from_numpy(guidance_mask).to(self.device).unsqueeze(0)
842
+
843
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
844
+ for i, t in enumerate(timesteps):
845
+ if GUI_progress is not None:
846
+ GUI_progress[0] = int((i + 1) / len(timesteps) * 100)
847
+ # expand the latents if we are doing classifier free guidance
848
+ latent_model_input = (
849
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
850
+ )
851
+ latent_model_input = self.scheduler.scale_model_input(
852
+ latent_model_input, t
853
+ )
854
+
855
+ # predict the noise residual
856
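+ # Route the layout conditioning (per-prompt phrase counts, instance boxes, MIGC/naive-fuser
+ # step limits and attention scales) to the MIGC attention processors via cross_attention_kwargs.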
+ cross_attention_kwargs = {'prompt_nums': prompt_nums,
857
+ 'bboxes': bboxes,
858
+ 'ith': i,
859
+ 'embeds_pooler': embeds_pooler,
860
+ 'timestep': t,
861
+ 'height': height,
862
+ 'width': width,
863
+ 'MIGCsteps': MIGCsteps,
864
+ 'NaiveFuserSteps': NaiveFuserSteps,
865
+ 'ca_scale': ca_scale,
866
+ 'ea_scale': ea_scale,
867
+ 'sac_scale': sac_scale,
868
+ 'sa_preserve': sa_preserve,
869
+ 'use_sa_preserve': use_sa_preserve}
870
+
871
+ self.unet.eval()
872
+ noise_pred = self.unet(
873
+ latent_model_input,
874
+ t,
875
+ encoder_hidden_states=prompt_embeds,
876
+ cross_attention_kwargs=cross_attention_kwargs,
877
+ ).sample
878
+
879
+ # perform guidance
880
+ if do_classifier_free_guidance:
881
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
882
+ noise_pred = noise_pred_uncond + guidance_scale * (
883
+ noise_pred_text - noise_pred_uncond
884
+ )
885
+
886
+ step_output = self.scheduler.step(
887
+ noise_pred, t, latents, **extra_step_kwargs
888
+ )
889
+ latents = step_output.prev_sample
890
+
891
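+ # With use_sa_preserve, latents outside the changed boxes (guidance_mask == 1) are
+ # restored from the trajectory cached in self.embedding; with sa_preserve, the current
+ # step's latents are cached for reuse in a later call.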
+ ori_input = latents.detach().clone()
892
+ if use_sa_preserve and i in self.embedding:
893
+ latents = (
894
+ latents * (1.0 - guidance_mask)
895
+ + torch.from_numpy(self.embedding[i]).to(latents.device) * guidance_mask
896
+ ).float()
897
+
898
+ if sa_preserve:
899
+ self.embedding[i] = ori_input.cpu().numpy()
900
+
901
+ # call the callback, if provided
902
+ if i == len(timesteps) - 1 or (
903
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
904
+ ):
905
+ progress_bar.update()
906
+ if callback is not None and i % callback_steps == 0:
907
+ callback(i, t, latents)
908
+
909
+ if output_type == "latent":
910
+ image = latents
911
+ elif output_type == "pil":
912
+ # 8. Post-processing
913
+ image = self.decode_latents(latents)
914
+ image = self.numpy_to_pil(image)
915
+ else:
916
+ # 8. Post-processing
917
+ image = self.decode_latents(latents)
918
+
919
+ # Offload last model to CPU
920
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
921
+ self.final_offload_hook.offload()
922
+
923
+ if not return_dict:
924
+ return (image, None)
925
+
926
+ return StableDiffusionPipelineOutput(
927
+ images=image, nsfw_content_detected=None
928
+ )
migc/migc_utils.py ADDED
@@ -0,0 +1,143 @@
1
+ import argparse
2
+ import numpy as np
3
+ import torch
4
+ import os
5
+ import yaml
6
+ import random
7
+ from diffusers.utils.import_utils import is_accelerate_available
8
+ from transformers import CLIPTextModel, CLIPTokenizer
9
+ from migc.migc_pipeline import StableDiffusionMIGCPipeline, MIGCProcessor, AttentionStore
10
+ from diffusers import EulerDiscreteScheduler
11
+ from contextlib import nullcontext  # imported unconditionally: it is the fallback context manager below
12
+ if is_accelerate_available():
13
+ from accelerate import init_empty_weights
14
+
15
+
16
+ def seed_everything(seed):
17
+ # np.random.seed(seed)
18
+ torch.manual_seed(seed)
19
+ torch.cuda.manual_seed_all(seed)
20
+ random.seed(seed)
21
+
22
+
23
+ import torch
24
+ from typing import Callable, Dict, List, Optional, Union
25
+ from collections import defaultdict
26
+
27
+ LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
28
+
29
+ # We need to set Attention Processors for the following keys.
30
+ all_processor_keys = [
31
+ 'down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor', 'down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor',
32
+ 'down_blocks.0.attentions.1.transformer_blocks.0.attn1.processor', 'down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor',
33
+ 'down_blocks.1.attentions.0.transformer_blocks.0.attn1.processor', 'down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor',
34
+ 'down_blocks.1.attentions.1.transformer_blocks.0.attn1.processor', 'down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor',
35
+ 'down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor', 'down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor',
36
+ 'down_blocks.2.attentions.1.transformer_blocks.0.attn1.processor', 'down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor',
37
+ 'up_blocks.1.attentions.0.transformer_blocks.0.attn1.processor', 'up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor',
38
+ 'up_blocks.1.attentions.1.transformer_blocks.0.attn1.processor', 'up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor',
39
+ 'up_blocks.1.attentions.2.transformer_blocks.0.attn1.processor', 'up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor',
40
+ 'up_blocks.2.attentions.0.transformer_blocks.0.attn1.processor', 'up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor',
41
+ 'up_blocks.2.attentions.1.transformer_blocks.0.attn1.processor', 'up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor',
42
+ 'up_blocks.2.attentions.2.transformer_blocks.0.attn1.processor', 'up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor',
43
+ 'up_blocks.3.attentions.0.transformer_blocks.0.attn1.processor', 'up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor',
44
+ 'up_blocks.3.attentions.1.transformer_blocks.0.attn1.processor', 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor',
45
+ 'up_blocks.3.attentions.2.transformer_blocks.0.attn1.processor', 'up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor',
46
+ 'mid_block.attentions.0.transformer_blocks.0.attn1.processor', 'mid_block.attentions.0.transformer_blocks.0.attn2.processor'
47
+ ]
48
+
49
+ def load_migc(unet, attention_store, pretrained_MIGC_path: Union[str, Dict[str, torch.Tensor]], attn_processor,
50
+ **kwargs):
51
+
52
+ state_dict = torch.load(pretrained_MIGC_path, map_location="cpu")
53
+
54
+ # fill attn processors
55
+ attn_processors = {}
56
+ state_dict = state_dict['state_dict']
57
+
58
+
59
+ adapter_grouped_dict = defaultdict(dict)
60
+
61
+ # Remap MIGC checkpoint keys from the LDM naming (input_blocks / middle_block / output_blocks)
+ # to the diffusers UNet naming (down_blocks / mid_block / up_blocks).
62
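+ # e.g. an output_blocks entry with block index 3 maps to
+ # 'up_blocks.1.attentions.0.transformer_blocks.0' (index // 3 selects the block, index % 3 the attention).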
+ for key, value in state_dict.items():
63
+ key_list = key.split(".")
64
+ assert 'migc' in key_list
65
+ if 'input_blocks' in key_list:
66
+ model_type = 'down_blocks'
67
+ elif 'middle_block' in key_list:
68
+ model_type = 'mid_block'
69
+ else:
70
+ model_type = 'up_blocks'
71
+ index_number = int(key_list[3])
72
+ if model_type == 'down_blocks':
73
+ input_num1 = str(index_number//3)
74
+ input_num2 = str((index_number%3)-1)
75
+ elif model_type == 'mid_block':
76
+ input_num1 = '0'
77
+ input_num2 = '0'
78
+ else:
79
+ input_num1 = str(index_number//3)
80
+ input_num2 = str(index_number%3)
81
+ attn_key_list = [model_type,input_num1,'attentions',input_num2,'transformer_blocks','0']
82
+ if model_type == 'mid_block':
83
+ attn_key_list = [model_type,'attentions',input_num2,'transformer_blocks','0']
84
+ attn_processor_key = '.'.join(attn_key_list)
85
+ sub_key = '.'.join(key_list[key_list.index('migc'):])
86
+ adapter_grouped_dict[attn_processor_key][sub_key] = value
87
+
88
+ # Create MIGC Processor
89
+ config = {'not_use_migc': False}
90
+ for key, value_dict in adapter_grouped_dict.items():
91
+ dim = value_dict['migc.norm.bias'].shape[0]
92
+ config['C'] = dim
93
+ key_final = key + '.attn2.processor'
94
+ if key_final.startswith("mid_block"):
95
+ place_in_unet = "mid"
96
+ elif key_final.startswith("up_blocks"):
97
+ place_in_unet = "up"
98
+ elif key_final.startswith("down_blocks"):
99
+ place_in_unet = "down"
100
+
101
+ attn_processors[key_final] = attn_processor(config, attention_store, place_in_unet)
102
+ attn_processors[key_final].load_state_dict(value_dict)
103
+ attn_processors[key_final].to(device=unet.device, dtype=unet.dtype)
104
+
105
+ # Create CrossAttention/SelfAttention Processor
106
+ config = {'not_use_migc': True}
107
+ for key in all_processor_keys:
108
+ if key not in attn_processors.keys():
109
+ if key.startswith("mid_block"):
110
+ place_in_unet = "mid"
111
+ elif key.startswith("up_blocks"):
112
+ place_in_unet = "up"
113
+ elif key.startswith("down_blocks"):
114
+ place_in_unet = "down"
115
+ attn_processors[key] = attn_processor(config, attention_store, place_in_unet)
116
+ unet.set_attn_processor(attn_processors)
117
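+ # 32 matches the number of attention processors registered above (attn1 + attn2 across all listed blocks).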
+ attention_store.num_att_layers = 32
118
+
119
+
120
+ def offlinePipelineSetupWithSafeTensor(sd_safetensors_path):
121
+ project_dir = os.path.dirname(os.path.dirname(__file__))
122
+ migc_ckpt_path = os.path.join(project_dir, 'pretrained_weights/MIGC_SD14.ckpt')
123
+ clip_model_path = os.path.join(project_dir, 'migc_gui_weights/clip/text_encoder')
124
+ clip_tokenizer_path = os.path.join(project_dir, 'migc_gui_weights/clip/tokenizer')
125
+ original_config_file = os.path.join(project_dir, 'migc_gui_weights/v1-inference.yaml')
126
+ ctx = init_empty_weights if is_accelerate_available() else nullcontext
127
+ with ctx():
128
+ # text_encoder = CLIPTextModel(config)
129
+ text_encoder = CLIPTextModel.from_pretrained(clip_model_path)
130
+ tokenizer = CLIPTokenizer.from_pretrained(clip_tokenizer_path)
131
+ pipe = StableDiffusionMIGCPipeline.from_single_file(sd_safetensors_path,
132
+ original_config_file=original_config_file,
133
+ text_encoder=text_encoder,
134
+ tokenizer=tokenizer,
135
+ load_safety_checker=False)
136
+ print('Initializing pipeline')
137
+ pipe.attention_store = AttentionStore()
138
+ from migc.migc_utils import load_migc
139
+ load_migc(pipe.unet , pipe.attention_store,
140
+ migc_ckpt_path, attn_processor=MIGCProcessor)
141
+
142
+ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
143
+ return pipe
requirements.txt CHANGED
@@ -4,4 +4,6 @@ diffusers
4
  invisible_watermark
5
  torch
6
  transformers
7
- xformers
7
+ xformers
8
+ einops
9
+ scipy