pangkaicheng committed
Commit f8a73ec · 0 Parent(s)

first commit
.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv/
11
+ .env
12
+
13
+ generated_images/
14
+ .chainlit/
15
+ .idea/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
README.md ADDED
@@ -0,0 +1,13 @@
1
+ The FashionRec dataset must be downloaded.
2
+
3
+ A .env file is required, containing the following variables:
4
+
5
+ CHAINLIT_PORT=8888
6
+
7
+ PROXY=http://127.0.0.1:10809
8
+
9
+ OPENAI_API_KEY=
10
+ GEMINI_API_KEY=
11
+
12
+ FASHION_DATA_ROOT=path_to_FashionRec
13
+ GEN_IMG_DIR="./generated_images"
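For convenience, a minimal environment check (a sketch, not part of this commit) that loads the `.env` described above with python-dotenv, which `chainlit_app.py` already depends on, and fails fast if a required key is missing:

```python
# Sketch only: verify the .env keys this README lists before launching the app.
import os
from dotenv import load_dotenv

load_dotenv()

REQUIRED_KEYS = ["CHAINLIT_PORT", "OPENAI_API_KEY", "FASHION_DATA_ROOT", "GEN_IMG_DIR"]
missing = [key for key in REQUIRED_KEYS if not os.getenv(key)]
if missing:
    raise RuntimeError(f"Missing required .env keys: {missing}")
print("Environment looks complete.")
```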
chainlit.md ADDED
@@ -0,0 +1,14 @@
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
chainlit_app.py ADDED
@@ -0,0 +1,159 @@
1
+ import json
2
+ import os
3
+ import pandas as pd
4
+ import uuid
5
+
6
+ from openai import AsyncOpenAI
7
+ from dotenv import load_dotenv
8
+ import chainlit as cl
9
+
10
+ from system_message import SYSTEM_MESSAGE
11
+ from mcp_client import MCPClient
12
+ from utils import create_image_grid
13
+ import httpx
14
+
15
+
16
+ # Load environment variables from .env
17
+ load_dotenv()
18
+
19
+ # Initialize OpenAI client
20
+ CHAINLIT_PORT = os.getenv("CHAINLIT_PORT", "8888")
21
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
22
+ OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
23
+ FASHION_DATA_ROOT = os.getenv("FASHION_DATA_ROOT")
24
+ PROXY = os.getenv("PROXY")
25
+ items_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/items_lite.parquet")
26
+ item_id_set = set(items_df.item_id)
27
+
28
+ http_client = httpx.AsyncClient(proxy=PROXY) if PROXY else httpx.AsyncClient()  # AsyncOpenAI requires an async client
29
+
30
+
31
+ class FashionAgent:
32
+ def __init__(self, user_id=None):
33
+ self.mcp_client = MCPClient("mcp_server_config.json", user_id)
34
+ self.openai = AsyncOpenAI(api_key=OPENAI_API_KEY, http_client=http_client)
35
+ self.user_id = user_id
36
+
37
+
38
+ # Global FashionAgent instance
39
+ agent = FashionAgent(user_id=None)
40
+
41
+
42
+ @cl.on_chat_start
43
+ async def on_chat_start():
44
+ await agent.mcp_client.connect_to_servers()
45
+ cl.user_session.set("agent", agent)
46
+ await cl.Message(content="Hello Sophia! Welcome to FashionM3. How can I assist you today?").send()
47
+
48
+
49
+ @cl.on_message
50
+ async def on_message(message: cl.Message):
51
+ agent = cl.user_session.get("agent")
52
+ user_id = cl.user_session.get("user_id")
53
+ chat_history = cl.user_session.get("chat_history", [])
54
+
55
+ user_message = message.content
56
+
57
+ upload_image = [x.path for x in message.elements if isinstance(x, cl.Image)]
58
+ if len(upload_image) == 1:
59
+ user_message += f"\nThe uploaded image path is: {os.path.abspath(upload_image[0])}"
60
+ elif len(upload_image) > 1:
61
+ merged_image_path = f".files/{uuid.uuid4().hex}.jpg"
62
+ create_image_grid(upload_image[:4], merged_image_path)
63
+
64
+ user_message += f"\nThe uploaded image path is: {os.path.abspath(merged_image_path)}"
65
+
66
+ image_in_database = []
67
+ for image in message.elements:
68
+ if isinstance(image, cl.Image):
69
+ item_id = image.name.split(".")[0]
70
+ if item_id in item_id_set:
71
+ image_in_database.append(item_id)
72
+ if len(image_in_database) > 0:
73
+ user_message += f"\nUser id is: {user_id}"
74
+ user_message += f"\nlist_of_items are: {image_in_database}"
75
+ elif user_id:
76
+ user_message += f"\nUser id is: {user_id}"
77
+
78
+ # Prepare messages for OpenAI API
79
+ messages = [
80
+ {"role": "system", "content": SYSTEM_MESSAGE},
81
+ *[{"role": "user" if isinstance(msg, cl.Message) else "assistant", "content": msg.content} for msg in chat_history],
82
+ {"role": "user", "content": user_message}
83
+ ]
84
+
85
+ # Fetch available tools
86
+ available_tools = await agent.mcp_client.get_tools()
87
+
88
+ # Initial OpenAI API call
89
+ response = await agent.openai.chat.completions.create(
90
+ model="gpt-4o-mini",
91
+ messages=messages,
92
+ max_tokens=1000,
93
+ tools=available_tools if available_tools else None,
94
+ tool_choice="auto" if available_tools else None
95
+ )
96
+
97
+ # Process the response
98
+ response_message = response.choices[0].message
99
+ if response_message.tool_calls:
100
+ # Handle tool calls
101
+ for tool_call in response_message.tool_calls:
102
+ tool_name = tool_call.function.name
103
+ params = json.loads(tool_call.function.arguments)
104
+ try:
105
+ print(f"Agent execute {tool_name} with params: {params}")
106
+ result = await agent.mcp_client.execute_tool(tool_name, params)
107
+ if tool_name == "retrieve_image":
108
+ image_path = json.loads(result['result'][0].text)['image_path']
109
+ similarity = json.loads(result['result'][0].text)['similarity']
110
+ output = f"I found a matching fashion item with a similarity score of {similarity:.2f}"
111
+
112
+ images = [cl.Image(path=image_path, name="Product image", display="inline", size="medium")]
113
+ await cl.Message(content=output, elements=images, author="Fashion Agent").send()
114
+ if tool_name == "image_generate":
115
+ image_path = result['result'][0].text
116
+ images = [cl.Image(path=image_path, name="Product image", display="inline", size="medium")]
117
+ output = f"Here is the generated image."
118
+ await cl.Message(content=output, elements=images, author="Fashion Agent").send()
119
+ if tool_name == "fashion_recommend_without_image":
120
+ output = result['result'][0].text
121
+ await cl.Message(content=output, author="Fashion Agent").send()
122
+ if tool_name == "fashion_recommend":
123
+ output = json.loads(result['result'][0].text)['recommendation']
124
+ # user_preference = json.loads(result['result'][0].text)['user_preference']
125
+ # await cl.Message(content=user_preference, author="Fashion Agent").send()
126
+ await cl.Message(content=output, author="Fashion Agent").send()
127
+ if tool_name == "try_on":
128
+ image_path = result['result'][0].text
129
+ images = [cl.Image(path=image_path, name="Try-on image", display="inline", size="large")]
130
+ output = f"Here is the virtual try-on image."
131
+ await cl.Message(content=output, elements=images, author="Fashion Agent").send()
132
+ else:
133
+ output = result
134
+ except Exception as e:
135
+ output = f"Error executing tool {tool_name}: {str(e)}"
136
+
137
+ # Update chat history
138
+ chat_history.append(cl.Message(content=message.content, author="user"))
139
+ chat_history.append(cl.Message(content=output, author="assistant"))
140
+ cl.user_session.set("chat_history", chat_history)
141
+ else:
142
+ # Direct response from the model
143
+ output = response_message.content
144
+ chat_history.append(cl.Message(content=message.content, author="user"))
145
+ chat_history.append(cl.Message(content=output, author="assistant"))
146
+ cl.user_session.set("chat_history", chat_history)
147
+ await cl.Message(content=output, author="Fashion Agent").send()
148
+
149
+
150
+ @cl.on_chat_end
151
+ def on_chat_end():
152
+ print("Goodbye", cl.user_session.get("id"))
153
+
154
+
155
+ if __name__ == "__main__":
156
+ from chainlit.cli import run_chainlit
157
+
158
+ os.environ["CHAINLIT_PORT"] = CHAINLIT_PORT
159
+ run_chainlit(__file__)
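For orientation, a hypothetical example (all values invented) of the dictionary `MCPClient.execute_tool` returns for the `retrieve_image` tool, showing the parsing that `on_message` above performs on `result['result'][0].text`:

```python
import json
from types import SimpleNamespace

# Hypothetical shape of MCPClient.execute_tool's return value for "retrieve_image"
# (values invented): "result" holds the MCP content blocks, and [0].text carries
# the JSON payload that on_message parses above.
result = {
    "result": [SimpleNamespace(text='{"image_path": "images/12345.jpg", "similarity": 0.87}')],
    "server": "fashion_vlm",
}
payload = json.loads(result["result"][0].text)
print(payload["image_path"], payload["similarity"])  # -> images/12345.jpg 0.87
```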
mcp_client.py ADDED
@@ -0,0 +1,110 @@
1
+ from typing import Dict, Any, List, Optional
2
+ from contextlib import AsyncExitStack
3
+ import json
4
+ import aiohttp
5
+ import asyncio
6
+
7
+ from mcp import ClientSession, StdioServerParameters
8
+ from mcp.client.stdio import stdio_client
9
+ from mcp.client.sse import sse_client
10
+
11
+
12
+ class MCPClient:
13
+ def __init__(self, config_path: str, user_id: Optional[str] = None):
14
+ """
15
+ Initialize MCPClient from a JSON config file (e.g. mcp_server_config.json).
16
+ Each entry under 'mcpServers' selects a transport: 'stdio' entries provide 'command', 'args', and optional 'env'; 'sse' entries provide a 'url'.
17
+ """
18
+ self.user_id = user_id
19
+ with open(config_path, 'r') as f:
20
+ self.server_configs = json.load(f)['mcpServers']
21
+ self.sessions: Dict[str, Any] = {} # one ClientSession per connected server (stdio or SSE)
22
+ self.exit_stack = AsyncExitStack()
23
+
24
+ async def connect_to_servers(self):
25
+ """Connect to all configured MCP servers based on their transport type."""
26
+ for server_name, config in self.server_configs.items():
27
+ transport = config.get("transport", "stdio") # 默认使用 stdio
28
+ print(f"Connecting to {server_name} ({transport})...")
29
+ if transport == "stdio":
30
+ command = config.get("command")
31
+ args = config.get("args", [])
32
+ env = config.get("env", None)
33
+ if not command:
34
+ raise ValueError(f"No command specified for server {server_name}")
35
+
36
+ server_params = StdioServerParameters(command=command, args=args, env=env)
37
+ stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
38
+ stdio, write = stdio_transport
39
+ session = await self.exit_stack.enter_async_context(ClientSession(stdio, write))
40
+ await session.initialize()
41
+ self.sessions[server_name] = session
42
+ # self.stdio_transports[server_name] = (stdio, write)
43
+ elif transport == "sse":
44
+ server_url = config.get("url", "")
45
+ if not server_url:
46
+ raise ValueError(f"No base_url specified for server {server_name}")
47
+
48
+ # Establish the SSE connection
49
+ streams_context = sse_client(url=f"{server_url}/sse")
50
+ streams = await self.exit_stack.enter_async_context(streams_context)
51
+ session_context = ClientSession(*streams)
52
+ session = await self.exit_stack.enter_async_context(session_context)
53
+
54
+ # Initialize the session
55
+ await session.initialize()
56
+ self.sessions[server_name] = session
57
+ # self.sse_contexts[server_name] = (streams_context, session_context)
58
+
59
+ # Verify the connection by listing tools
60
+ print(f"Initialized SSE client for {server_name}...")
61
+ print("Listing tools...")
62
+ response = await session.list_tools()
63
+ tools = response.tools
64
+ print(f"Connected to {server_name} with tools:", [tool.name for tool in tools])
65
+ else:
66
+ raise ValueError(f"Unsupported transport type '{transport}' for {server_name}")
67
+
68
+ async def get_tools(self) -> List[Dict[str, Any]]:
69
+ """
70
+ Fetch the list of available tools from all connected MCP servers.
71
+ Returns a list of tool definitions with name, description, and inputSchema.
72
+ """
73
+ all_tools = []
74
+ for server_name, session in self.sessions.items():
75
+ response = await session.list_tools()
76
+ for tool in response.tools:
77
+ if not self.user_id and tool.name == 'personalized_fashion_recommend':
78
+ continue
79
+ all_tools.append(
80
+ {
81
+ "type": "function",
82
+ "function": {
83
+ "name": tool.name,
84
+ "description": tool.description,
85
+ "parameters": tool.inputSchema
86
+ }
87
+ }
88
+ )
89
+ return all_tools
90
+
91
+ async def execute_tool(self, tool_name: str, params: Dict[str, Any]) -> Dict[str, Any]:
92
+ """
93
+ Execute a tool with the given parameters on the appropriate server.
94
+ """
95
+ # Find which server has this tool
96
+ for server_name, session in self.sessions.items():
97
+ response = await session.list_tools()
98
+ for tool in response.tools:
99
+ if tool.name == tool_name:
100
+ # Execute the tool on the correct server
101
+ result = await session.call_tool(tool_name, params)
102
+ return {
103
+ "result": result.content,
104
+ "server": server_name
105
+ }
106
+ raise Exception(f"Tool {tool_name} not found on any connected server")
107
+
108
+ async def close(self):
109
+ """Close all server connections."""
110
+ await self.exit_stack.aclose()
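A minimal standalone usage sketch (not part of this commit) for `MCPClient`, assuming the `mcp_server_config.json` added below and servers that are actually reachable:

```python
import asyncio

from mcp_client import MCPClient


async def main():
    # Connect to every server declared in mcp_server_config.json, list the tools
    # in OpenAI function-calling format, then close the connections.
    client = MCPClient("mcp_server_config.json", user_id=None)
    try:
        await client.connect_to_servers()
        tools = await client.get_tools()
        print([tool["function"]["name"] for tool in tools])
    finally:
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())
```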
mcp_server_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "mcpServers": {
3
+ "fashion_vlm": {
4
+ "transport": "sse",
5
+ "url": "http://localhost:8000"
6
+ },
7
+ "virtual_try_on": {
8
+ "command": "python",
9
+ "args": ["mcp_servers/virtual_try_on.py"],
10
+ "env": {
11
+ "HTTP_PROXY": "http://127.0.0.1:10809",
12
+ "HTTPS_PROXY": "http://127.0.0.1:10809"
13
+ }
14
+ }
15
+ }
16
+ }
mcp_servers/fashion_vlm/__init__.py ADDED
File without changes
mcp_servers/fashion_vlm/fashion_vlm_infer.py ADDED
@@ -0,0 +1,157 @@
1
+ from typing import List
2
+ import os
3
+ import datetime
4
+ from omegaconf import OmegaConf
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ import torch
8
+ from torch import tensor
9
+ from torchvision import transforms
10
+ from PIL import Image
11
+ from models import Showo, MAGVITv2, CLIPVisionTower, get_mask_chedule
12
+ from prompting_utils import (UniversalPrompting,
13
+ create_attention_mask_for_mmu,
14
+ create_attention_mask_predict_next)
15
+ from transformers import AutoTokenizer, CLIPImageProcessor
16
+
17
+
18
+ def image_transform(image, resolution=256, normalize=True):
19
+ image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC)(image)
20
+ image = transforms.CenterCrop((resolution, resolution))(image)
21
+ image = transforms.ToTensor()(image)
22
+ if normalize:
23
+ image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image)
24
+ return image
25
+
26
+
27
+ class FashionVLM:
28
+ def __init__(self, temperature, top_k, max_new_tokens, fashion_vlm_name,
29
+ batch_size=3, guidance_scale=5, generation_temperature=1.0, generation_timesteps=50,
30
+ save_dir="generated_images"):
31
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
+ self.temperature = temperature # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
33
+ self.top_k = top_k # retain only the top_k most likely tokens, clamp others to have 0 probability
34
+ self.max_new_tokens = max_new_tokens
35
+ self.fashion_vlm_name = fashion_vlm_name
36
+
37
+ # Parameters for text-to-image generation
38
+ self.save_dir = save_dir
39
+ self.batch_size = batch_size
40
+ self.guidance_scale = guidance_scale
41
+ self.generation_temperature = generation_temperature
42
+ self.generation_timesteps = generation_timesteps
43
+
44
+ self._init_models()
45
+
46
+ def _init_models(self):
47
+ # Initialize UniversalPrompting
48
+ self.tokenizer = AutoTokenizer.from_pretrained(
49
+ "microsoft/phi-1_5",
50
+ padding_side="left"
51
+ )
52
+ self.uni_prompting = UniversalPrompting(
53
+ self.tokenizer,
54
+ max_text_len=381,
55
+ special_tokens=(
56
+ "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"
57
+ ),
58
+ ignore_id=-100, cond_dropout_prob=0.1
59
+ )
60
+
61
+ # Initialize the VQ model
62
+ self.vq_model = MAGVITv2.from_pretrained("showlab/magvitv2").to(self.device)
63
+ self.vq_model.requires_grad_(False)
64
+ self.vq_model.eval()
65
+
66
+ self.model = Showo.from_pretrained(self.fashion_vlm_name).to(self.device)
67
+ self.model.eval()
68
+
69
+ def mmu_infer_tensor(self, image: tensor, prompt: tensor):
70
+ """
71
+ Image size: batch * 3 * 256 * 256
72
+ """
73
+ image = image.to(self.device)
74
+ prompt = prompt.to(self.device)
75
+ # pixel_values = self.clip_image_processor.preprocess(image_ori, return_tensors="pt")["pixel_values"][0]
76
+ image_tokens = self.vq_model.get_code(image) + len(self.uni_prompting.text_tokenizer)
77
+
78
+ input_ids = torch.cat([
79
+ (torch.ones(prompt.shape[0], 1) * self.uni_prompting.sptids_dict['<|mmu|>']).to(self.device),
80
+ (torch.ones(prompt.shape[0], 1) * self.uni_prompting.sptids_dict['<|soi|>']).to(self.device),
81
+ image_tokens,
82
+ (torch.ones(prompt.shape[0], 1) * self.uni_prompting.sptids_dict['<|eoi|>']).to(self.device),
83
+ (torch.ones(prompt.shape[0], 1) * self.uni_prompting.sptids_dict['<|sot|>']).to(self.device),
84
+ prompt
85
+ ], dim=1).long()
86
+
87
+ attention_mask = create_attention_mask_for_mmu(
88
+ input_ids.to(self.device),
89
+ eoi_id=int(self.uni_prompting.sptids_dict['<|eoi|>'])
90
+ )
91
+
92
+ cont_toks_list = self.model.mmu_generate(
93
+ input_ids, attention_mask=attention_mask,
94
+ max_new_tokens=self.max_new_tokens, top_k=self.top_k,
95
+ eot_token=self.uni_prompting.sptids_dict['<|eot|>']
96
+ )
97
+
98
+ cont_toks_list = torch.stack(cont_toks_list).squeeze()[None]
99
+
100
+ text = self.uni_prompting.text_tokenizer.batch_decode(cont_toks_list, skip_special_tokens=True)
101
+ return text
102
+
103
+ def t2i_infer(self, prompts: List[str]):
104
+ output_path = []
105
+ for step in tqdm(range(0, len(prompts), self.batch_size)):
106
+ batch_prompt = prompts[step:step + self.batch_size]
107
+ image_tokens = torch.ones((len(batch_prompt), self.model.config.num_vq_tokens),
108
+ dtype=torch.long, device=self.device) * self.model.config.mask_token_id
109
+ input_ids, _ = self.uni_prompting((batch_prompt, image_tokens), 't2i_gen')
110
+ if self.guidance_scale > 0:
111
+ uncond_input_ids, _ = self.uni_prompting(([''] * len(batch_prompt), image_tokens), 't2i_gen')
112
+ attention_mask = create_attention_mask_predict_next(
113
+ torch.cat([input_ids, uncond_input_ids], dim=0),
114
+ pad_id=int(self.uni_prompting.sptids_dict['<|pad|>']),
115
+ soi_id=int(self.uni_prompting.sptids_dict['<|soi|>']),
116
+ eoi_id=int(self.uni_prompting.sptids_dict['<|eoi|>']),
117
+ rm_pad_in_image=True
118
+ )
119
+ else:
120
+ uncond_input_ids = None
121
+ attention_mask = create_attention_mask_predict_next(
122
+ input_ids,
123
+ pad_id=int(self.uni_prompting.sptids_dict['<|pad|>']),
124
+ soi_id=int(self.uni_prompting.sptids_dict['<|soi|>']),
125
+ eoi_id=int(self.uni_prompting.sptids_dict['<|eoi|>']),
126
+ rm_pad_in_image=True
127
+ )
128
+
129
+ mask_schedule = get_mask_chedule("cosine")
130
+ with torch.no_grad():
131
+ gen_token_ids = self.model.t2i_generate(
132
+ input_ids=input_ids,
133
+ uncond_input_ids=uncond_input_ids,
134
+ attention_mask=attention_mask,
135
+ guidance_scale=self.guidance_scale,
136
+ temperature=self.generation_temperature,
137
+ timesteps=self.generation_timesteps,
138
+ noise_schedule=mask_schedule,
139
+ )
140
+
141
+ gen_token_ids = torch.clamp(gen_token_ids, max=self.model.config.codebook_size - 1, min=0)
142
+ images = self.vq_model.decode_code(gen_token_ids)
143
+ images = torch.clamp((images + 1.0) / 2.0, min=0.0, max=1.0)
144
+ images *= 255.0
145
+ images = images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
146
+
147
+ # Save the generated images
148
+ for idx, image in enumerate(images, start=1):
149
+ image = Image.fromarray(image)
150
+ # Build a unique filename from a timestamp and the index
151
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
152
+ image_filename = f"{timestamp}_{step + idx}.jpg"
153
+ image_path = os.path.join(self.save_dir, image_filename)
154
+ image.save(image_path)
155
+ output_path.append(image_path)
156
+
157
+ return output_path
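A hedged single-image inference sketch for `FashionVLM` (not part of this commit), mirroring how `get_recommendation` in `main.py` below builds the prompt; the image path is a placeholder and the checkpoints must be available locally or from the Hub:

```python
import torch
from PIL import Image
from transformers import AutoTokenizer

from fashion_vlm_infer import FashionVLM, image_transform

# Placeholder image path; model names follow the ones used in main.py below.
vlm = FashionVLM(temperature=0.8, top_k=1, max_new_tokens=1000,
                 fashion_vlm_name="Anony100/FashionVLM")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", padding_side="left")

image = image_transform(Image.open("example.jpg").convert("RGB")).unsqueeze(0)
prompt_ids = tokenizer(["USER: \nRecommend a matching outfit. ASSISTANT:"])["input_ids"][0]
prompt = torch.tensor(prompt_ids).unsqueeze(0)

print(vlm.mmu_infer_tensor(image, prompt)[0])  # decoded recommendation text
```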
mcp_servers/fashion_vlm/main.py ADDED
@@ -0,0 +1,592 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import numpy as np
4
+ from itertools import combinations
5
+ from typing import Dict, Any, List, Optional
6
+ import pandas as pd
7
+
8
+ from scipy import sparse
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import pickle
11
+ import uvicorn
12
+ import torch
13
+ from PIL import Image
14
+ from torchvision import transforms
15
+ from transformers import AutoTokenizer, CLIPProcessor, CLIPModel
16
+ from mcp.server.fastmcp import FastMCP
17
+ from mcp.server.sse import SseServerTransport
18
+ from starlette.routing import Route, Mount
19
+ from starlette.applications import Starlette
20
+ from openai import AsyncOpenAI
21
+
22
+ from fashion_vlm_infer import FashionVLM
23
+ from prompting_utils import UniversalPrompting
24
+
25
+
26
+ # Load environment variables
27
+ load_dotenv()
28
+ FASHION_DATA_ROOT = os.getenv("FASHION_DATA_ROOT", "/mnt/d/PostDoc/fifth paper/code/FashionVLM/datasets/FashionRec")
29
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
30
+ OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
31
+ openai = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_API_BASE)
32
+ VALID_CATEGORIES = [
33
+ 'Pants', 'Coats', 'Cross-body bags', 'Shirts', 'Hats & caps', 'Sneakers', 'Jeans', 'Boots', 'Dresses', 'Sandals',
34
+ 'T-shirts & vests', 'Knitwear', 'Skirts', 'Earrings', 'Hats', 'Sweaters & knitwear', 'Loafers', 'Ballet flats',
35
+ 'Espadrilles', 'Tote bags', 'Shoulder bags', 'Slides & flip flops', 'Pumps', 'Necklaces', 'Polo shirts', 'Suits',
36
+ 'Oxford shoes', 'Bracelets', 'Jackets', 'Tops', 'Rings', 'Mules', 'Luggage & holdalls', 'Brogues', 'Activewear',
37
+ 'Belts', 'Derby shoes', 'Mini bags', 'Watches', 'Backpacks', 'Denim', 'Laptop bags & briefcases', 'Clutch bags',
38
+ 'Clutches', 'Lingerie & Nightwear', 'Skiwear', 'Sunglasses', 'Ties & bow ties', 'Shorts', 'Scarves', 'Messenger bags'
39
+ ]
40
+
41
+
42
+ ###################################
43
+ #########Loading Data##############
44
+ ###################################
45
+ # Load item metadata
46
+ items_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/items_lite.parquet").set_index("item_id")
47
+ outfits_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/outfits_lite.parquet").set_index("outfit_id")
48
+ users_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/users_lite.parquet").set_index("user_id")
49
+ image_paths = items_df["path"].to_dict()
50
+
51
+ ###################################
52
+ #########Loading Model#############
53
+ ###################################
54
+ # Load CLIP model and processor
55
+ print("Loading CLIP Model")
56
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", local_files_only=True)
57
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", local_files_only=True)
58
+ clip_model.eval()
59
+
60
+ print("Loading Fashion VLM params")
61
+
62
+ fashion_vlm = FashionVLM(
63
+ max_new_tokens=1000,
64
+ temperature=0.8,
65
+ top_k=1,
66
+ fashion_vlm_name='Anony100/FashionVLM',
67
+ save_dir="/mnt/d/PostDoc/fifth paper/code/FashionM3/generated_images"
68
+ )
69
+
70
+ resolution = 512
71
+ image_transform = transforms.Compose([
72
+ transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC),
73
+ transforms.CenterCrop((resolution, resolution)),
74
+ transforms.ToTensor(),
75
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
76
+ ])
77
+
78
+ print("Loading tokenizer")
79
+ tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-1_5', padding_side="left")
80
+ uni_prompting = UniversalPrompting(
81
+ tokenizer,
82
+ max_text_len=128,
83
+ special_tokens=(
84
+ "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"
85
+ ),
86
+ ignore_id=-100, cond_dropout_prob=0.1
87
+ )
88
+
89
+
90
+ class InteractionDataManager:
91
+ def __init__(self, users_df, outfits_df, items_df):
92
+ """
93
+ Initialize the manager: load the dataframes and build lookup structures.
94
+
95
+ Args:
96
+ - users_df: user metadata DataFrame, indexed by user_id
97
+ - outfits_df: outfit metadata DataFrame, indexed by outfit_id
98
+ - items_df: item metadata DataFrame, indexed by item_id
99
+ """
100
+ self.users_df = users_df
101
+ self.outfits_df = outfits_df
102
+ self.items_df = items_df
103
+
104
+ # Build id <-> index mappings
105
+ self.item_id_to_index = {item_id: index for index, item_id in enumerate(self.items_df.index)}
106
+ self.index_to_item_id = {index: item_id for index, item_id in enumerate(self.items_df.index)}
107
+ self.user_id_to_index = {user_id: index for index, user_id in enumerate(self.users_df.index)}
108
+ self.index_to_user_id = {index: user_id for index, user_id in enumerate(self.users_df.index)}
109
+ self.outfit_ids_dict = self.outfits_df['item_ids'].to_dict() # get outfit's item ids from outfit id
110
+ self.item_category_dict = self.items_df['category'].to_dict() # get item's category from item id
111
+ self.item_subcategory_dict = self.items_df['subcategory'].to_dict() # get item's subcategory from item id
112
+ self.n_items = len(self.items_df)
113
+ self.n_users = len(self.users_df)
114
+
115
+ self.user_outfit_pairs = []
116
+ outfit_set = set(self.outfits_df.index)
117
+ for uid, user in self.users_df.iterrows():
118
+ oids = user.outfit_ids.split(",")
119
+ self.user_outfit_pairs.extend([(uid, oid) for oid in oids if oid in outfit_set])
120
+
121
+ # Precompute subcategory -> item-id sets (via groupby)
122
+ self.subcategory_to_items = self.items_df.groupby('subcategory').apply(lambda x: set(x.index)).to_dict()
123
+
124
+ # Precompute subcategory -> item-index sets for fast lookup
125
+ self.subcategory_to_indices = {}
126
+ for subcategory, item_ids in self.subcategory_to_items.items():
127
+ self.subcategory_to_indices[subcategory] = set([self.item_id_to_index[item_id]
128
+ for item_id in item_ids
129
+ if item_id in self.item_id_to_index])
130
+
131
+ item_interaction_matrix_path = f'{FASHION_DATA_ROOT}/data/personalized_recommendation/temp_matrix/item_matrix.npz'
132
+ try:
133
+ self.load_matrix('item', item_interaction_matrix_path)
134
+ except FileNotFoundError:
135
+ self.build_item_interaction_matrix()
136
+ self.save_matrix('item', item_interaction_matrix_path)
137
+
138
+ user_item_interaction_matrix_path = f'{FASHION_DATA_ROOT}/data/personalized_recommendation/temp_matrix/user_item_matrix.npz'
139
+ try:
140
+ self.load_matrix('user_item', user_item_interaction_matrix_path)
141
+ except FileNotFoundError:
142
+ self.build_user_item_interaction_matrix()
143
+ self.save_matrix('user_item', user_item_interaction_matrix_path)
144
+
145
+ # Load per-item CLIP features
146
+ with open(f"{FASHION_DATA_ROOT}/meta/clip_features.pkl", "rb") as f:
147
+ print("Loading Fashion Features...")
148
+ self.clip_features = pickle.load(f)
149
+ print("Loading Fashion Features Successfully")
150
+
151
+ # Prepare embeddings and item IDs
152
+ self.item_ids = list(self.clip_features.keys())
153
+ self.image_embeddings = np.array([self.clip_features[item_id]["image_embeds"] for item_id in self.item_ids])
154
+
155
+ def save_matrix(self, matrix_type, filepath):
156
+ """
157
+ Save a matrix to disk.
158
+
159
+ Args:
160
+ - matrix_type: 'item' or 'user_item', which matrix to save
161
+ - filepath: destination path (e.g. 'temp/item_matrix.npz')
162
+ """
163
+ if matrix_type == 'item':
164
+ matrix = self.item_interaction_matrix
165
+ elif matrix_type == 'user_item':
166
+ matrix = self.user_item_interaction_matrix
167
+ else:
168
+ raise ValueError("matrix_type must be 'item' or 'user_item'")
169
+
170
+ if matrix is None:
171
+ raise ValueError(f"{matrix_type} matrix has not been built yet.")
172
+
173
+ sparse.save_npz(filepath, matrix)
174
+ print(f"Saved {matrix_type} matrix to {filepath}")
175
+
176
+ def load_matrix(self, matrix_type, filepath):
177
+ """
178
+ Load a matrix from disk.
179
+
180
+ Args:
181
+ - matrix_type: 'item' or 'user_item', which matrix to load
182
+ - filepath: source path (e.g. 'temp/item_matrix.npz')
183
+ """
184
+ if not os.path.exists(filepath):
185
+ raise FileNotFoundError(f"File {filepath} does not exist.")
186
+
187
+ matrix = sparse.load_npz(filepath)
188
+ if matrix_type == 'item':
189
+ self.item_interaction_matrix = matrix
190
+ elif matrix_type == 'user_item':
191
+ self.user_item_interaction_matrix = matrix
192
+ else:
193
+ raise ValueError("matrix_type must be 'item' or 'user_item'")
194
+
195
+ print(f"Loaded {matrix_type} matrix from {filepath}")
196
+ return matrix
197
+
198
+ def build_item_interaction_matrix(self):
199
+ """构建 Item-Item 交互矩阵"""
200
+ # 初始化单品交互矩阵
201
+ self.item_interaction_matrix = sparse.lil_matrix((self.n_items, self.n_items), dtype=int)
202
+
203
+ for index, outfit in self.outfits_df.iterrows():
204
+ item_ids = outfit['item_ids'].split(',')
205
+ # Count co-occurrences of item pairs
206
+ for item_id1, item_id2 in combinations(item_ids, r=2):
207
+ if item_id1 in self.item_id_to_index and item_id2 in self.item_id_to_index:
208
+ idx1 = self.item_id_to_index[item_id1]
209
+ idx2 = self.item_id_to_index[item_id2]
210
+ self.item_interaction_matrix[idx1, idx2] += 1
211
+ self.item_interaction_matrix[idx2, idx1] += 1 # symmetric for unordered pairs
212
+
213
+ # Convert to CSR format
214
+ self.item_interaction_matrix = self.item_interaction_matrix.tocsr()
215
+ return self.item_interaction_matrix
216
+
217
+ def build_user_item_interaction_matrix(self):
218
+ """构建 User-Item 交互矩阵"""
219
+ # 初始化用户-单品交互矩阵
220
+ self.user_item_interaction_matrix = sparse.lil_matrix((self.n_users, self.n_items), dtype=int)
221
+
222
+ for uid, user in self.users_df.iterrows():
223
+ oids = user["outfit_ids"].split(",")
224
+ outfits = self.outfits_df.loc[self.outfits_df.index.isin(oids)]
225
+ for oid, outfit in outfits.iterrows():
226
+ item_ids = outfit['item_ids'].split(',')
227
+ # Count user-item interactions
228
+ for iid in item_ids:
229
+ if iid in self.item_id_to_index:
230
+ uidx = self.user_id_to_index[uid]
231
+ iidx = self.item_id_to_index[iid]
232
+ self.user_item_interaction_matrix[uidx, iidx] += 1
233
+
234
+ # Convert to CSR format
235
+ self.user_item_interaction_matrix = self.user_item_interaction_matrix.tocsr()
236
+ return self.user_item_interaction_matrix
237
+
238
+ def _process_interactions_for_category(
239
+ self,
240
+ matrix,
241
+ given_id,
242
+ category_indices,
243
+ id_to_index
244
+ ):
245
+ """
246
+ Collect the interactions of a single entity (user or item) with the target category.
247
+
248
+ Args:
249
+ - matrix: interaction matrix
250
+ - given_id: the given entity id (user or item)
251
+ - category_indices: set of item indices belonging to the target category
252
+
253
+ Returns:
254
+ - a list of interactions, each a dict with item_id, interaction_count, and score
255
+ """
256
+ interactions = []
257
+
258
+ given_index = id_to_index[given_id]
259
+ row = matrix[given_index]
260
+
261
+ # Extract the non-zero entries of this row
262
+ row_start = row.indptr[0]
263
+ row_end = row.indptr[1]
264
+ col_indices = row.indices[row_start:row_end]
265
+ data_values = row.data[row_start:row_end]
266
+
267
+ # Keep only items that belong to the target category
268
+ for col_idx, value in zip(col_indices, data_values):
269
+ # Check whether this column corresponds to an item in the target category
270
+ if col_idx in category_indices:
271
+ # Resolve the item id
272
+ output_id = self.index_to_item_id[col_idx]
273
+ interactions.append({
274
+ 'item_id': output_id,
275
+ 'interaction_count': int(value),
276
+ 'score': 0.0
277
+ })
278
+
279
+ return interactions
280
+
281
+ def get_item_category_interactions(
282
+ self,
283
+ target_category: str,
284
+ given_ids: List[str],
285
+ query_type='item', # item or user
286
+ top_k=None,
287
+ ):
288
+ """
289
+ Get all interactions of the given entities (users or items) with the target category.
290
+
291
+ Args:
292
+ - target_category: the subcategory to query
293
+ - given_ids: list of entity ids (user ids or item ids)
294
+ - query_type: type of the given entities, 'item' or 'user'
295
+ - top_k: return only the k items with the most interactions; None returns all
296
+
297
+ Returns:
298
+ - a list of interaction statistics for the target category, sorted by interaction count
299
+ """
300
+ if query_type == 'item':
301
+ matrix = self.item_interaction_matrix
302
+ id_to_index = self.item_id_to_index
303
+ elif query_type == 'user':
304
+ matrix = self.user_item_interaction_matrix
305
+ id_to_index = self.user_id_to_index
306
+ else:
307
+ print(f'query_type must be either item or user but got {query_type}')
308
+ return []
309
+
310
+ # Collect all interaction records
311
+ all_interactions = []
312
+ category = target_category
313
+ category_indices = self.subcategory_to_indices.get(category, set()) # all item indices in this subcategory
314
+
315
+ # Gather all interactions for each given entity
316
+ for given_id in given_ids:
317
+ interactions = self._process_interactions_for_category(
318
+ matrix, given_id, category_indices, id_to_index
319
+ )
320
+ # Append the interactions to the result list
321
+ all_interactions.extend(interactions)
322
+
323
+ # Merge interaction counts for the same item
324
+ item_interactions = {}
325
+ for interaction in all_interactions:
326
+ item_id = interaction['item_id']
327
+ count = interaction['interaction_count']
328
+
329
+ if item_id in item_interactions:
330
+ item_interactions[item_id] += count
331
+ else:
332
+ item_interactions[item_id] = count
333
+
334
+ # Convert to the output format
335
+ merged_interactions = [
336
+ {'item_id': item_id, 'interaction_count': count, 'score': 0.0}
337
+ for item_id, count in item_interactions.items()
338
+ ]
339
+
340
+ # Sort by interaction count
341
+ if merged_interactions:
342
+ merged_interactions.sort(key=lambda x: x['interaction_count'], reverse=True)
343
+
344
+ # Truncate to top-k
345
+ if top_k and merged_interactions:
346
+ merged_interactions = merged_interactions[:top_k]
347
+
348
+ # Return the merged results
349
+ return merged_interactions
350
+
351
+ def rank_by_similarity(self, item_interactions, user_interactions, beta=2.0):
352
+ """
353
+ Score each of the user's interacted items by similarity to the given item interactions and sort them in descending order.
354
+ """
355
+ def get_combined_features(feature_dict):
356
+ return (feature_dict['image_embeds'] + feature_dict['text_embeds']) / 2
357
+
358
+ if not item_interactions:
359
+ return user_interactions
360
+
361
+ item_feature_list = []
362
+ for item in item_interactions:
363
+ item_id = item['item_id']
364
+ if item_id not in self.clip_features:
365
+ raise ValueError(f"Didn't find clip feature of item with id: {item_id}")
366
+
367
+ item_features = get_combined_features(self.clip_features[item_id])
368
+ item_feature_list.append(item_features)
369
+
370
+ weights = np.array([x['interaction_count'] for x in item_interactions], dtype=np.float32)
371
+ weights = weights / np.sum(weights)
372
+ item_feature = np.sum(np.stack(item_feature_list, axis=0) * weights[:, np.newaxis], axis=0).reshape(1, -1)
373
+
374
+ max_count = max((user_item.get('interaction_count', 1) for user_item in user_interactions), default=1)
375
+ for user_item in user_interactions:
376
+ user_item_id = user_item['item_id']
377
+ if user_item_id not in self.clip_features:
378
+ raise ValueError(f"Didn't find clip feature of item with id: {user_item_id}")
379
+
380
+ user_item_features = get_combined_features(self.clip_features[user_item_id]).reshape(1, -1)
381
+ similarity = cosine_similarity(user_item_features, item_feature).item()
382
+ interaction_count = user_item['interaction_count']
383
+ count_factor = (interaction_count / max_count) * beta + 1
384
+ user_item['score'] = float(similarity) * count_factor
385
+
386
+ user_interactions.sort(key=lambda x: x.get('score', 0), reverse=True)
387
+ return user_interactions
388
+
389
+
390
+ data_manager = InteractionDataManager(users_df, outfits_df, items_df)
391
+ mcp = FastMCP('fashion-vlm-server')
392
+
393
+
394
+ async def compute_text_embedding(text: str) -> np.ndarray:
395
+ inputs = clip_processor(text=text, return_tensors="pt", padding=True, truncation=True)
396
+ with torch.no_grad():
397
+ text_embedding = clip_model.get_text_features(**inputs).numpy()
398
+ return text_embedding / np.linalg.norm(text_embedding, axis=1, keepdims=True)
399
+
400
+
401
+ async def find_most_similar_image(text_embedding: np.ndarray) -> Dict[str, Any]:
402
+ similarities = np.dot(data_manager.image_embeddings, text_embedding.T).flatten()
403
+ most_similar_idx = np.argmax(similarities)
404
+ most_similar_item_id = data_manager.item_ids[most_similar_idx]
405
+ return {
406
+ "image_path": image_paths[most_similar_item_id],
407
+ "similarity": float(similarities[most_similar_idx])
408
+ }
409
+
410
+
411
+ @mcp.tool()
412
+ async def retrieve_image(text: str) -> Dict[str, Any]:
413
+ """Search for the most similar fashion image based on a text description.
414
+
415
+ Args:
416
+ text (str): Text description of the fashion item to search.
417
+ """
418
+ print(f"Searching for {text}")
419
+ text_embedding = await compute_text_embedding(text)
420
+ return await find_most_similar_image(text_embedding)
421
+
422
+
423
+ def get_recommendation(query, image_path):
424
+ image = Image.open(image_path).convert("RGB")
425
+ image = image_transform(image).unsqueeze(0)
426
+ prompt = uni_prompting.text_tokenizer(['USER: \n' + query + ' ASSISTANT:'])['input_ids'][0]
427
+ prompt = torch.tensor(prompt).unsqueeze(0)
428
+ results = fashion_vlm.mmu_infer_tensor(image, prompt)
429
+ response = results[0]
430
+ return response
431
+
432
+
433
+ @mcp.tool()
434
+ async def fashion_recommend(query: str, image_path: str, target_category: str, user_id: Optional[str], list_of_items: List[str]) -> Dict[str, str]:
435
+ """Generate fashion recommendations based on a user's query and uploaded image.
436
+
437
+ This function processes the recommendation in the following steps:
438
+ 1. Retrieves the user's interaction history for the specified target category using user_id, target_category, and list_of_items.
439
+ 2. Summarizes the user's preferences for the target category by analyzing descriptions of previously interacted fashion items via a language model.
440
+ 3. Appends the summarized preference (as a single sentence) to the query and processes it with the uploaded image using a Fashion Vision-Language Model (VLM).
441
+ 4. Returns the personalized recommendation along with the derived user preference.
442
+
443
+ The target_category is inferred from the query (e.g., "I want a skirt ..." implies "Skirts") and must belong to a predefined list of valid categories.
444
+
445
+ Args:
446
+ query (str): A complete sentence explicitly stating the user's desired fashion item (e.g., "I want a skirt for summer"). Must be in English.
447
+ image_path (str): File path to the user-uploaded image, provided via the prompt.
448
+ target_category (str): The specific fashion category of interest, derived from the query (e.g., "Skirts"). Must be in valid categories.
449
+ user_id (str): Unique identifier for the user, provided via the prompt.
450
+ list_of_items (List[str]): List of item IDs used to filter the user's interaction history, provided via the prompt.
451
+
452
+ Returns:
453
+ Dict[str, str]: A dictionary containing:
454
+ - "recommendation": The personalized fashion recommendation text.
455
+ - "user_preference": The summarized user preference sentence.
456
+
457
+ Valid Categories:
458
+ ['Pants', 'Coats', 'Cross-body bags', 'Shirts', 'Hats & caps', 'Sneakers', 'Jeans', 'Boots', 'Dresses', 'Sandals',
459
+ 'T-shirts & vests', 'Knitwear', 'Skirts', 'Earrings', 'Hats', 'Sweaters & knitwear', 'Loafers', 'Ballet flats',
460
+ 'Espadrilles', 'Tote bags', 'Shoulder bags', 'Slides & flip flops', 'Pumps', 'Necklaces', 'Polo shirts', 'Suits',
461
+ 'Oxford shoes', 'Bracelets', 'Jackets', 'Tops', 'Rings', 'Mules', 'Luggage & holdalls', 'Brogues', 'Activewear',
462
+ 'Belts', 'Derby shoes', 'Mini bags', 'Watches', 'Backpacks', 'Denim', 'Laptop bags & briefcases', 'Clutch bags',
463
+ 'Clutches', 'Lingerie & Nightwear', 'Skiwear', 'Sunglasses', 'Ties & bow ties', 'Shorts', 'Scarves', 'Messenger bags']
464
+ """
465
+ def get_item(item_id: str) -> pd.Series:
466
+ return data_manager.items_df.loc[item_id]
467
+
468
+ # If no image uploaded, we should use fashion_recommend_without_image
469
+ if image_path == "":
470
+ recommendation = await fashion_recommend_without_image(query)
471
+ return {
472
+ "recommendation": recommendation,
473
+ "user_preference": ""
474
+ }
475
+
476
+ # If no user_id provided or user_id not found in database
477
+ if not user_id or user_id not in data_manager.user_id_to_index.keys():
478
+ return {
479
+ "recommendation": get_recommendation(query, image_path),
480
+ "user_preference": ""
481
+ }
482
+
483
+ user_preference = ""
484
+ if target_category in VALID_CATEGORIES:
485
+ user_interaction_result = data_manager.get_item_category_interactions(
486
+ target_category, [user_id], query_type='user'
487
+ )
488
+
489
+ if len(list_of_items) != 0:
490
+ item_interaction_result = data_manager.get_item_category_interactions(
491
+ target_category, list_of_items, query_type='item'
492
+ )
493
+ else:
494
+ item_interaction_result = []
495
+
496
+ descriptions_for_summary = []
497
+ historical_image_path = []
498
+
499
+ if len(user_interaction_result) > 0:
500
+ user_interaction_result = data_manager.rank_by_similarity(
501
+ item_interaction_result,
502
+ user_interaction_result
503
+ )
504
+ for x in user_interaction_result[:5]:
505
+ item = get_item(x['item_id'])
506
+ descriptions_for_summary.append(item['gen_description'])
507
+ historical_image_path.append(os.path.abspath(item['path']))
508
+
509
+ if descriptions_for_summary:
510
+ user_message = f"Summary user's preference of {target_category} based on following descriptions of fashion items that user brought previously:"
511
+ for x in descriptions_for_summary:
512
+ user_message += f"\n{x}"
513
+ # Get summary using OpenAI API call
514
+ response = await openai.chat.completions.create(
515
+ model="gpt-4o-mini",
516
+ messages=[
517
+ {"role": "system", "content": f"You are a user preference summary assistant. Your response is limited in one sentence, staring at 'I prefer ...'"},
518
+ {"role": "user", "content": user_message}
519
+ ],
520
+ max_tokens=1000,
521
+ )
522
+ user_preference = response.choices[0].message.content
523
+ query += user_preference
524
+
525
+ return {
526
+ "recommendation": get_recommendation(query, image_path),
527
+ "user_preference": user_preference
528
+ }
529
+
530
+
531
+ @mcp.tool()
532
+ async def fashion_recommend_without_image(query: str) -> str:
533
+ """Recommend fashion items sorely based on user's query.
534
+ Output texts of fashion recommendation from model.
535
+
536
+ Args:
537
+ query (str): User's fashion related query including their recommendation request.
538
+ """
539
+ response = await openai.chat.completions.create(
540
+ model="gpt-4o-mini",
541
+ messages=[
542
+ {"role": "system", "content": "You are a fashion stylist. You should answer user's fashion-related question, especially about fashion recommendation."},
543
+ {"role": "user", "content": query}
544
+ ],
545
+ max_tokens=500,
546
+ )
547
+ return response.choices[0].message.content
548
+
549
+
550
+ @mcp.tool()
551
+ async def image_generate(text: str) -> str:
552
+ """"Generate image based on description. Output is path that saves generated image.
553
+
554
+ Args:
555
+ text (str): Descriptive text from user. Used for fashion image generation. English ONLY!
556
+ """
557
+ output_path = fashion_vlm.t2i_infer([text])[0]
558
+ output_path = os.path.abspath(output_path)
559
+ print(f"Generated image saved at {output_path}")
560
+ return output_path
561
+
562
+
563
+ # Get the internal MCP Server object
564
+ mcp_server = mcp._mcp_server
565
+ sse_transport = SseServerTransport("/messages/")
566
+
567
+
568
+ # Handle SSE connections
569
+ async def handle_sse(request):
570
+ print("Handling SSE connection")
571
+ async with sse_transport.connect_sse(request.scope, request.receive, request._send) as streams:
572
+ read_stream, write_stream = streams
573
+ await mcp_server.run(
574
+ read_stream,
575
+ write_stream,
576
+ mcp_server.create_initialization_options(),
577
+ )
578
+
579
+ # Define routes
580
+ routes = [
581
+ Route("/sse", endpoint=handle_sse),
582
+ Mount("/messages/", app=sse_transport.handle_post_message),
583
+ ]
584
+
585
+ # Create the Starlette application
586
+ starlette_app = Starlette(routes=routes)
587
+
588
+
589
+ if __name__ == "__main__":
590
+ print("Starting Fashion VLM server with HTTP and SSE...")
591
+ uvicorn.run(starlette_app, host="0.0.0.0", port=8000)
592
+
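For illustration, a hypothetical set of arguments the chat agent might pass to the `fashion_recommend` tool above (all ids and paths invented); in `chainlit_app.py` this dict comes from `json.loads(tool_call.function.arguments)` and is forwarded through `MCPClient.execute_tool`:

```python
# Hypothetical tool-call arguments for fashion_recommend (all values invented).
args = {
    "query": "I want a skirt for summer",
    "image_path": "/abs/path/to/uploaded.jpg",  # "" falls back to fashion_recommend_without_image
    "target_category": "Skirts",                # must be one of VALID_CATEGORIES
    "user_id": "u_000123",
    "list_of_items": ["item_0001", "item_0002"],
}
# e.g. result = await client.execute_tool("fashion_recommend", args)
```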
mcp_servers/fashion_vlm/models/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .modeling_showo import Showo
2
+ from .modeling_magvitv2 import VQGANEncoder, VQGANDecoder, LFQuantizer, MAGVITv2
3
+ from .sampling import *
4
+ from .clip_encoder import CLIPVisionTower
mcp_servers/fashion_vlm/models/clip_encoder.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+
6
+ class CLIPVisionTower(nn.Module):
7
+ def __init__(self, vision_tower):
8
+ super().__init__()
9
+
10
+ self.is_loaded = False
11
+
12
+ self.vision_tower_name = vision_tower
13
+ self.select_layer = -2
14
+ self.select_feature = "patch"
15
+ self.load_model()
16
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
17
+
18
+ def load_model(self, device_map=None):
19
+ if self.is_loaded:
20
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
21
+ return
22
+
23
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
24
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
25
+ self.vision_tower.requires_grad_(False)
26
+
27
+ self.is_loaded = True
28
+
29
+ def feature_select(self, image_forward_outs):
30
+ image_features = image_forward_outs.hidden_states[self.select_layer]
31
+ if self.select_feature == 'patch':
32
+ image_features = image_features[:, 1:]
33
+ elif self.select_feature == 'cls_patch':
34
+ image_features = image_features
35
+ else:
36
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
37
+ return image_features
38
+
39
+ @torch.no_grad()
40
+ def forward(self, images):
41
+ if type(images) is list:
42
+ image_features = []
43
+ for image in images:
44
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
45
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
46
+ image_features.append(image_feature)
47
+ else:
48
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
49
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
50
+
51
+ return image_features
52
+
53
+ @property
54
+ def dummy_feature(self):
55
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
56
+
57
+ @property
58
+ def dtype(self):
59
+ return self.vision_tower.dtype
60
+
61
+ @property
62
+ def device(self):
63
+ return self.vision_tower.device
64
+
65
+ @property
66
+ def config(self):
67
+ if self.is_loaded:
68
+ return self.vision_tower.config
69
+ else:
70
+ return self.cfg_only
71
+
72
+ @property
73
+ def hidden_size(self):
74
+ return self.config.hidden_size
75
+
76
+ @property
77
+ def num_patches_per_side(self):
78
+ return self.config.image_size // self.config.patch_size
79
+
80
+ @property
81
+ def num_patches(self):
82
+ return (self.config.image_size // self.config.patch_size) ** 2
83
+
84
+
85
+ class CLIPVisionTowerS2(CLIPVisionTower):
86
+ def __init__(self, vision_tower, args, delay_load=False):
87
+ super().__init__(vision_tower, args, delay_load)
88
+
89
+ self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
90
+ self.s2_scales = list(map(int, self.s2_scales.split(',')))
91
+ self.s2_scales.sort()
92
+ self.s2_split_size = self.s2_scales[0]
93
+ self.s2_image_size = self.s2_scales[-1]
94
+
95
+ try:
96
+ from s2wrapper import forward as multiscale_forward
97
+ except ImportError:
98
+ raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git')
99
+ self.multiscale_forward = multiscale_forward
100
+
101
+ # change resize/crop size in preprocessing to the largest image size in s2_scale
102
+ if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
103
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
104
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
105
+
106
+ def load_model(self, device_map=None):
107
+ if self.is_loaded:
108
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
109
+ return
110
+
111
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
112
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
113
+ self.vision_tower.requires_grad_(False)
114
+
115
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
116
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
117
+
118
+ self.is_loaded = True
119
+
120
+ @torch.no_grad()
121
+ def forward_feature(self, images):
122
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
123
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
124
+ return image_features
125
+
126
+ @torch.no_grad()
127
+ def forward(self, images):
128
+ if type(images) is list:
129
+ image_features = []
130
+ for image in images:
131
+ image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
132
+ image_features.append(image_feature)
133
+ else:
134
+ image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
135
+
136
+ return image_features
137
+
138
+ @property
139
+ def hidden_size(self):
140
+ return self.config.hidden_size * len(self.s2_scales)
mcp_servers/fashion_vlm/models/common_modules.py ADDED
@@ -0,0 +1,357 @@
1
+ """
2
+ Modified from https://github.com/CompVis/taming-transformers/blob/master/taming/modules/diffusionmodules/model.py#L34
3
+ """
4
+
5
+ import math
6
+ from typing import Tuple, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange, repeat
13
+ from einops.layers.torch import Rearrange
14
+
15
+
16
+ def nonlinearity(x):
17
+ # swish
18
+ return x * torch.sigmoid(x)
19
+
20
+
21
+ def Normalize(in_channels):
22
+ return torch.nn.GroupNorm(
23
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
24
+ )
25
+
26
+
27
+ class Upsample(nn.Module):
28
+ def __init__(self, in_channels, with_conv):
29
+ super().__init__()
30
+ self.with_conv = with_conv
31
+ if self.with_conv:
32
+ self.conv = torch.nn.Conv2d(
33
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
34
+ )
35
+
36
+ def forward(self, x):
37
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
38
+ if self.with_conv:
39
+ x = self.conv(x)
40
+ return x
41
+
42
+
43
+ class DepthToSpaceUpsample(nn.Module):
44
+ def __init__(
45
+ self,
46
+ in_channels,
47
+ ):
48
+ super().__init__()
49
+ conv = nn.Conv2d(in_channels, in_channels * 4, 1)
50
+
51
+ self.net = nn.Sequential(
52
+ conv,
53
+ nn.SiLU(),
54
+ Rearrange("b (c p1 p2) h w -> b c (h p1) (w p2)", p1=2, p2=2),
55
+ )
56
+
57
+ self.init_conv_(conv)
58
+
59
+ def init_conv_(self, conv):
60
+ o, i, h, w = conv.weight.shape
61
+ conv_weight = torch.empty(o // 4, i, h, w)
62
+ nn.init.kaiming_uniform_(conv_weight)
63
+ conv_weight = repeat(conv_weight, "o ... -> (o 4) ...")
64
+
65
+ conv.weight.data.copy_(conv_weight)
66
+ nn.init.zeros_(conv.bias.data)
67
+
68
+ def forward(self, x):
69
+ out = self.net(x)
70
+ return out
71
+
72
+
73
+ class Downsample(nn.Module):
74
+ def __init__(self, in_channels, with_conv):
75
+ super().__init__()
76
+ self.with_conv = with_conv
77
+ if self.with_conv:
78
+ # no asymmetric padding in torch conv, must do it ourselves
79
+ self.conv = torch.nn.Conv2d(
80
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
81
+ )
82
+
83
+ def forward(self, x):
84
+ if self.with_conv:
85
+ pad = (0, 1, 0, 1)
86
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
87
+ x = self.conv(x)
88
+ else:
89
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
90
+ return x
91
+
92
+
93
+ def unpack_time(t, batch):
94
+ _, c, w, h = t.size()
95
+ out = torch.reshape(t, [batch, -1, c, w, h])
96
+ out = rearrange(out, "b t c h w -> b c t h w")
97
+ return out
98
+
99
+
100
+ def pack_time(t):
101
+ out = rearrange(t, "b c t h w -> b t c h w")
102
+ _, _, c, w, h = out.size()
103
+ return torch.reshape(out, [-1, c, w, h])
104
+
105
+
106
+ class TimeDownsample2x(nn.Module):
107
+ def __init__(
108
+ self,
109
+ dim,
110
+ dim_out=None,
111
+ kernel_size=3,
112
+ ):
113
+ super().__init__()
114
+ if dim_out is None:
115
+ dim_out = dim
116
+ self.time_causal_padding = (kernel_size - 1, 0)
117
+ self.conv = nn.Conv1d(dim, dim_out, kernel_size, stride=2)
118
+
119
+ def forward(self, x):
120
+ x = rearrange(x, "b c t h w -> b h w c t")
121
+ b, h, w, c, t = x.size()
122
+ x = torch.reshape(x, [-1, c, t])
123
+
124
+ x = F.pad(x, self.time_causal_padding)
125
+ out = self.conv(x)
126
+
127
+ out = torch.reshape(out, [b, h, w, c, t])
128
+ out = rearrange(out, "b h w c t -> b c t h w")
129
+ # out = rearrange(out, "b h w c t -> b c t h w")  # duplicate of the line above; disabled
130
+ return out
131
+
132
+
133
+ class TimeUpsample2x(nn.Module):
134
+ def __init__(self, dim, dim_out=None):
135
+ super().__init__()
136
+ if dim_out is None:
137
+ dim_out = dim
138
+ conv = nn.Conv1d(dim, dim_out * 2, 1)
139
+
140
+ self.net = nn.Sequential(
141
+ nn.SiLU(), conv, Rearrange("b (c p) t -> b c (t p)", p=2)
142
+ )
143
+
144
+ self.init_conv_(conv)
145
+
146
+ def init_conv_(self, conv):
147
+ o, i, t = conv.weight.shape
148
+ conv_weight = torch.empty(o // 2, i, t)
149
+ nn.init.kaiming_uniform_(conv_weight)
150
+ conv_weight = repeat(conv_weight, "o ... -> (o 2) ...")
151
+
152
+ conv.weight.data.copy_(conv_weight)
153
+ nn.init.zeros_(conv.bias.data)
154
+
155
+ def forward(self, x):
156
+ x = rearrange(x, "b c t h w -> b h w c t")
157
+ b, h, w, c, t = x.size()
158
+ x = torch.reshape(x, [-1, c, t])
159
+
160
+ out = self.net(x)
161
+ out = out[:, :, 1:].contiguous()
162
+
163
+ out = torch.reshape(out, [b, h, w, c, t])
164
+ out = rearrange(out, "b h w c t -> b c t h w")
165
+ return out
166
+
167
+
168
+ class AttnBlock(nn.Module):
169
+ def __init__(self, in_channels):
170
+ super().__init__()
171
+ self.in_channels = in_channels
172
+
173
+ self.norm = Normalize(in_channels)
174
+ self.q = torch.nn.Conv2d(
175
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
176
+ )
177
+ self.k = torch.nn.Conv2d(
178
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
179
+ )
180
+ self.v = torch.nn.Conv2d(
181
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
182
+ )
183
+ self.proj_out = torch.nn.Conv2d(
184
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
185
+ )
186
+
187
+ def forward(self, x):
188
+ h_ = x
189
+ h_ = self.norm(h_)
190
+ q = self.q(h_)
191
+ k = self.k(h_)
192
+ v = self.v(h_)
193
+
194
+ # compute attention
195
+ b, c, h, w = q.shape
196
+ q = q.reshape(b, c, h * w)
197
+ q = q.permute(0, 2, 1) # b,hw,c
198
+ k = k.reshape(b, c, h * w) # b,c,hw
199
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
200
+ w_ = w_ * (int(c) ** (-0.5))
201
+ w_ = torch.nn.functional.softmax(w_, dim=2)
202
+
203
+ # attend to values
204
+ v = v.reshape(b, c, h * w)
205
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
206
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
207
+ h_ = h_.reshape(b, c, h, w)
208
+
209
+ h_ = self.proj_out(h_)
210
+
211
+ return x + h_
212
+
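The bmm/softmax arithmetic in AttnBlock is single-head dot-product attention over the h*w spatial positions. A standalone sketch checking it against torch.nn.functional.scaled_dot_product_attention (PyTorch 2.x; illustration only, not part of the commit):

```python
import torch
import torch.nn.functional as F

b, c, h, w = 2, 16, 8, 8
q, k, v = (torch.randn(b, c, h, w) for _ in range(3))

# manual path, mirroring AttnBlock.forward
q_ = q.reshape(b, c, h * w).permute(0, 2, 1)                      # (b, hw, c)
k_ = k.reshape(b, c, h * w)                                       # (b, c, hw)
w_ = torch.softmax(torch.bmm(q_, k_) * c ** -0.5, dim=2)          # (b, hw, hw)
out_manual = torch.bmm(v.reshape(b, c, h * w), w_.permute(0, 2, 1)).reshape(b, c, h, w)

# reference path
out_ref = F.scaled_dot_product_attention(
    q_, k_.permute(0, 2, 1), v.reshape(b, c, h * w).permute(0, 2, 1)
).permute(0, 2, 1).reshape(b, c, h, w)

assert torch.allclose(out_manual, out_ref, atol=1e-5)
```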
213
+
214
+ class TimeAttention(AttnBlock):
215
+ def forward(self, x, *args, **kwargs):
216
+ x = rearrange(x, "b c t h w -> b h w t c")
217
+ b, h, w, t, c = x.size()
218
+ x = torch.reshape(x, (-1, t, c))
219
+
220
+ x = super().forward(x, *args, **kwargs)
221
+
222
+ x = torch.reshape(x, [b, h, w, t, c])
223
+ return rearrange(x, "b h w t c -> b c t h w")
224
+
225
+
226
+ class Residual(nn.Module):
227
+ def __init__(self, fn: nn.Module):
228
+ super().__init__()
229
+ self.fn = fn
230
+
231
+ def forward(self, x, **kwargs):
232
+ return self.fn(x, **kwargs) + x
233
+
234
+
235
+ def cast_tuple(t, length=1):
236
+ return t if isinstance(t, tuple) else ((t,) * length)
237
+
238
+
239
+ class CausalConv3d(nn.Module):
240
+ def __init__(
241
+ self,
242
+ chan_in,
243
+ chan_out,
244
+ kernel_size: Union[int, Tuple[int, int, int]],
245
+ pad_mode="constant",
246
+ **kwargs
247
+ ):
248
+ super().__init__()
249
+ kernel_size = cast_tuple(kernel_size, 3)
250
+
251
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
252
+
253
+ dilation = kwargs.pop("dilation", 1)
254
+ stride = kwargs.pop("stride", 1)
255
+
256
+ self.pad_mode = pad_mode
257
+ time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
258
+ height_pad = height_kernel_size // 2
259
+ width_pad = width_kernel_size // 2
260
+
261
+ self.time_pad = time_pad
262
+ self.time_causal_padding = (
263
+ width_pad,
264
+ width_pad,
265
+ height_pad,
266
+ height_pad,
267
+ time_pad,
268
+ 0,
269
+ )
270
+
271
+ stride = (stride, 1, 1)
272
+ dilation = (dilation, 1, 1)
273
+ self.conv = nn.Conv3d(
274
+ chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs
275
+ )
276
+
277
+ def forward(self, x):
278
+ pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant"
279
+
280
+ x = F.pad(x, self.time_causal_padding, mode=pad_mode)
281
+ return self.conv(x)
282
+
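For a 3x3x3 kernel the padding computed in CausalConv3d is two frames on the left of the time axis and one pixel on each spatial side, which preserves the input (T, H, W) while keeping the convolution causal in time. A standalone sketch of that padding arithmetic with a plain Conv3d (illustration only):

```python
import torch
import torch.nn.functional as F

conv = torch.nn.Conv3d(4, 4, kernel_size=3)          # no built-in padding
x = torch.randn(1, 4, 5, 16, 16)                      # (B, C, T, H, W)
pad = (1, 1, 1, 1, 2, 0)                               # (w, w, h, h, time_left, time_right)
y = conv(F.pad(x, pad, mode="constant"))
assert y.shape == x.shape
```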
283
+
284
+ def ResnetBlockCausal3D(
285
+ dim, kernel_size: Union[int, Tuple[int, int, int]], pad_mode: str = "constant"
286
+ ):
287
+ net = nn.Sequential(
288
+ Normalize(dim),
289
+ nn.SiLU(),
290
+ CausalConv3d(dim, dim, kernel_size, pad_mode),
291
+ Normalize(dim),
292
+ nn.SiLU(),
293
+ CausalConv3d(dim, dim, kernel_size, pad_mode),
294
+ )
295
+ return Residual(net)
296
+
297
+
298
+ class ResnetBlock(nn.Module):
299
+ def __init__(
300
+ self,
301
+ *,
302
+ in_channels,
303
+ out_channels=None,
304
+ conv_shortcut=False,
305
+ dropout,
306
+ temb_channels=512
307
+ ):
308
+ super().__init__()
309
+ self.in_channels = in_channels
310
+ out_channels = in_channels if out_channels is None else out_channels
311
+ self.out_channels = out_channels
312
+ self.use_conv_shortcut = conv_shortcut
313
+
314
+ self.norm1 = Normalize(in_channels)
315
+ self.conv1 = torch.nn.Conv2d(
316
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
317
+ )
318
+ if temb_channels > 0:
319
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
320
+ else:
321
+ self.temb_proj = None
322
+ self.norm2 = Normalize(out_channels)
323
+ self.dropout = torch.nn.Dropout(dropout)
324
+ self.conv2 = torch.nn.Conv2d(
325
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
326
+ )
327
+ if self.in_channels != self.out_channels:
328
+ if self.use_conv_shortcut:
329
+ self.conv_shortcut = torch.nn.Conv2d(
330
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
331
+ )
332
+ else:
333
+ self.nin_shortcut = torch.nn.Conv2d(
334
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
335
+ )
336
+
337
+ def forward(self, x, temb):
338
+ h = x
339
+ h = self.norm1(h)
340
+ h = nonlinearity(h)
341
+ h = self.conv1(h)
342
+
343
+ if temb is not None:
344
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
345
+
346
+ h = self.norm2(h)
347
+ h = nonlinearity(h)
348
+ h = self.dropout(h)
349
+ h = self.conv2(h)
350
+
351
+ if self.in_channels != self.out_channels:
352
+ if self.use_conv_shortcut:
353
+ x = self.conv_shortcut(x)
354
+ else:
355
+ x = self.nin_shortcut(x)
356
+
357
+ return x + h
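End-of-file note: a minimal usage sketch of ResnetBlock, showing the 1x1 `nin_shortcut` aligning the residual when the channel count changes (assumptions: the package is importable under this path, and `Normalize`/`nonlinearity` are defined earlier in this file as in the standard VQGAN code):

```python
import torch
from mcp_servers.fashion_vlm.models.common_modules import ResnetBlock  # assumed import path

block = ResnetBlock(in_channels=64, out_channels=128, dropout=0.0, temb_channels=0)
x = torch.randn(1, 64, 32, 32)
y = block(x, temb=None)          # temb_channels=0, so no timestep embedding is used
assert y.shape == (1, 128, 32, 32)
```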
mcp_servers/fashion_vlm/models/misc.py ADDED
@@ -0,0 +1,53 @@
1
+ from omegaconf import OmegaConf
2
+ import torch
3
+ from typing import (
4
+ Any,
5
+ Callable,
6
+ Dict,
7
+ Iterable,
8
+ List,
9
+ NamedTuple,
10
+ NewType,
11
+ Optional,
12
+ Sized,
13
+ Tuple,
14
+ Type,
15
+ TypeVar,
16
+ Union,
17
+ )
18
+ try:
19
+ from typing import Literal
20
+ except ImportError:
21
+ from typing_extensions import Literal
22
+
23
+ # Tensor dtype
24
+ # for jaxtyping usage, see https://github.com/google/jaxtyping/blob/main/API.md
25
+ from jaxtyping import Bool, Complex, Float, Inexact, Int, Integer, Num, Shaped, UInt
26
+
27
+ # Config type
28
+ from omegaconf import DictConfig
29
+
30
+ # PyTorch Tensor type
31
+ from torch import Tensor
32
+
33
+ # Runtime type checking decorator
34
+ from typeguard import typechecked as typechecker
35
+
36
+
37
+ def broadcast(tensor, src=0):
38
+ if not _distributed_available():
39
+ return tensor
40
+ else:
41
+ torch.distributed.broadcast(tensor, src=src)
42
+ return tensor
43
+
44
+ def _distributed_available():
45
+ return torch.distributed.is_available() and torch.distributed.is_initialized()
46
+
47
+ def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
48
+ # added by Xavier -- delete '--local-rank' in multi-nodes training, don't know why there is such a keyword
49
+ if '--local-rank' in cfg:
50
+ del cfg['--local-rank']
51
+ # added by Xavier -- delete '--local-rank' in multi-nodes training, don't know why there is such a keyword
52
+ scfg = OmegaConf.structured(fields(**cfg))
53
+ return scfg
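A minimal sketch of what `parse_structured` does (the `--local-rank` workaround aside): it validates a plain dict against a dataclass schema via `OmegaConf.structured`. Standalone illustration with a hypothetical schema, not part of the commit:

```python
from dataclasses import dataclass
from omegaconf import OmegaConf

@dataclass
class EncoderCfg:                 # hypothetical schema for illustration
    ch: int = 128
    z_channels: int = 13

scfg = OmegaConf.structured(EncoderCfg(**{"ch": 64}))   # what parse_structured reduces to
assert scfg.ch == 64 and scfg.z_channels == 13
```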
mcp_servers/fashion_vlm/models/modeling_magvitv2.py ADDED
@@ -0,0 +1,440 @@
1
+ from dataclasses import dataclass, field
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from .common_modules import *
6
+ from .modeling_utils import ConfigMixin, ModelMixin, register_to_config
7
+ from .misc import *
8
+ import math
9
+
10
+ class Updateable:
11
+ def do_update_step(
12
+ self, epoch: int, global_step: int, on_load_weights: bool = False
13
+ ):
14
+ for attr in self.__dir__():
15
+ if attr.startswith("_"):
16
+ continue
17
+ try:
18
+ module = getattr(self, attr)
19
+ except:
20
+ continue # ignore attributes like property, which can't be retrieved using getattr
21
+ if isinstance(module, Updateable):
22
+ module.do_update_step(
23
+ epoch, global_step, on_load_weights=on_load_weights
24
+ )
25
+ self.update_step(epoch, global_step, on_load_weights=on_load_weights)
26
+
27
+ def do_update_step_end(self, epoch: int, global_step: int):
28
+ for attr in self.__dir__():
29
+ if attr.startswith("_"):
30
+ continue
31
+ try:
32
+ module = getattr(self, attr)
33
+ except:
34
+ continue # ignore attributes like property, which can't be retrieved using getattr
35
+ if isinstance(module, Updateable):
36
+ module.do_update_step_end(epoch, global_step)
37
+ self.update_step_end(epoch, global_step)
38
+
39
+ def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
40
+ # override this method to implement custom update logic
41
+ # if on_load_weights is True, you should be careful doing things related to model evaluations,
42
+ # as the models and tensors are not guaranteed to be on the same device
43
+ pass
44
+
45
+ def update_step_end(self, epoch: int, global_step: int):
46
+ pass
47
+
48
+ class VQGANEncoder(ModelMixin, ConfigMixin):
49
+ @dataclass
50
+ class Config:
51
+ ch: int = 128
52
+ ch_mult: List[int] = field(default_factory=lambda: [1, 2, 2, 4, 4])
53
+ num_res_blocks: List[int] = field(default_factory=lambda: [4, 3, 4, 3, 4])
54
+ attn_resolutions: List[int] = field(default_factory=lambda: [5])
55
+ dropout: float = 0.0
56
+ in_ch: int = 3
57
+ out_ch: int = 3
58
+ resolution: int = 256
59
+ z_channels: int = 13
60
+ double_z: bool = False
61
+
62
+ def __init__(self,
63
+ ch: int = 128,
64
+ ch_mult: List[int] = [1, 2, 2, 4, 4],
65
+ num_res_blocks: List[int] = [4, 3, 4, 3, 4],
66
+ attn_resolutions: List[int] = [5],
67
+ dropout: float = 0.0,
68
+ in_ch: int = 3,
69
+ out_ch: int = 3,
70
+ resolution: int = 256,
71
+ z_channels: int = 13,
72
+ double_z: bool = False):
73
+ super().__init__()
74
+ self.ch = ch
75
+ self.temb_ch = 0
76
+ self.num_resolutions = len(ch_mult)
77
+ self.num_res_blocks = num_res_blocks
78
+ self.resolution = resolution
79
+ self.in_ch = in_ch
80
+ # downsampling
81
+ self.conv_in = torch.nn.Conv2d(
82
+ self.in_ch, self.ch, kernel_size=3, stride=1, padding=1
83
+ )
84
+
85
+ curr_res = self.resolution
86
+ in_ch_mult = (1,) + tuple(ch_mult)
87
+ self.down = nn.ModuleList()
88
+ for i_level in range(self.num_resolutions):
89
+ block = nn.ModuleList()
90
+ attn = nn.ModuleList()
91
+ block_in = self.ch * in_ch_mult[i_level]
92
+ block_out = self.ch * ch_mult[i_level]
93
+ for i_block in range(self.num_res_blocks[i_level]):
94
+ block.append(
95
+ ResnetBlock(
96
+ in_channels=block_in,
97
+ out_channels=block_out,
98
+ temb_channels=self.temb_ch,
99
+ dropout=dropout,
100
+ )
101
+ )
102
+ block_in = block_out
103
+ if curr_res in attn_resolutions:
104
+ attn.append(AttnBlock(block_in))
105
+ down = nn.Module()
106
+ down.block = block
107
+ down.attn = attn
108
+ if i_level != self.num_resolutions - 1:
109
+ down.downsample = Downsample(block_in, True)
110
+ curr_res = curr_res // 2
111
+ self.down.append(down)
112
+
113
+ # middle
114
+ self.mid = nn.Module()
115
+ self.mid.block_1 = ResnetBlock(
116
+ in_channels=block_in,
117
+ out_channels=block_in,
118
+ temb_channels=self.temb_ch,
119
+ dropout=dropout,
120
+ )
121
+ self.mid.attn_1 = AttnBlock(block_in)
122
+ self.mid.block_2 = ResnetBlock(
123
+ in_channels=block_in,
124
+ out_channels=block_in,
125
+ temb_channels=self.temb_ch,
126
+ dropout=dropout,
127
+ )
128
+
129
+
130
+ self.norm_out = Normalize(block_in)
131
+ self.conv_out = torch.nn.Conv2d(
132
+ block_in,
133
+ 2 * z_channels if double_z else z_channels,
134
+ kernel_size=3,
135
+ stride=1,
136
+ padding=1,
137
+ )
138
+
139
+ self.quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
140
+ # for param in self.parameters():
141
+ # broadcast(param, src=0)
142
+
143
+ def forward(self, x):
144
+ # timestep embedding
145
+ temb = None
146
+
147
+ # downsampling
148
+ hs = [self.conv_in(x)]
149
+ for i_level in range(self.num_resolutions):
150
+ for i_block in range(self.num_res_blocks[i_level]):
151
+ h = self.down[i_level].block[i_block](hs[-1], temb)
152
+ if len(self.down[i_level].attn) > 0:
153
+ h = self.down[i_level].attn[i_block](h)
154
+ hs.append(h)
155
+ if i_level != self.num_resolutions - 1:
156
+ hs.append(self.down[i_level].downsample(hs[-1]))
157
+
158
+ # middle
159
+ h = hs[-1]
160
+ h = self.mid.block_1(h, temb)
161
+ h = self.mid.attn_1(h)
162
+ h = self.mid.block_2(h, temb)
163
+
164
+ # end
165
+ h = self.norm_out(h)
166
+ h = nonlinearity(h)
167
+ h = self.conv_out(h)
168
+ h = self.quant_conv(h)
169
+ return h
170
+
171
+
172
+ class LFQuantizer(nn.Module):
173
+ def __init__(self, num_codebook_entry: int = -1,
174
+ codebook_dim: int = 13,
175
+ beta: float = 0.25,
176
+ entropy_multiplier: float = 0.1,
177
+ commit_loss_multiplier: float = 0.1, ):
178
+ super().__init__()
179
+ self.codebook_size = 2 ** codebook_dim
180
+ print(
181
+ f"Look-up free quantizer with codebook size: {self.codebook_size}"
182
+ )
183
+ self.e_dim = codebook_dim
184
+ self.beta = beta
185
+
186
+ indices = torch.arange(self.codebook_size)
187
+
188
+ binary = (
189
+ indices.unsqueeze(1)
190
+ >> torch.arange(codebook_dim - 1, -1, -1, dtype=torch.long)
191
+ ) & 1
192
+
193
+ embedding = binary.float() * 2 - 1
194
+ self.register_buffer("embedding", embedding)
195
+ self.register_buffer(
196
+ "power_vals", 2 ** torch.arange(codebook_dim - 1, -1, -1)
197
+ )
198
+ self.commit_loss_multiplier = commit_loss_multiplier
199
+ self.entropy_multiplier = entropy_multiplier
200
+
201
+ def get_indices(self, z_q):
202
+ return (
203
+ (self.power_vals.reshape(1, -1, 1, 1) * (z_q > 0).float())
204
+ .sum(1, keepdim=True)
205
+ .long()
206
+ )
207
+
208
+ def get_codebook_entry(self, indices, shape=None):
209
+ if shape is None:
210
+ h, w = int(math.sqrt(indices.shape[-1])), int(math.sqrt(indices.shape[-1]))
211
+ else:
212
+ h, w = shape
213
+ b, _ = indices.shape
214
+ indices = indices.reshape(-1)
215
+ z_q = self.embedding[indices]
216
+ z_q = z_q.view(b, h, w, -1)
217
+
218
+ # reshape back to match original input shape
219
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
220
+
221
+ return z_q
222
+
223
+ def forward(self, z, get_code=False):
224
+ """
225
+ Inputs the output of the encoder network z and maps it to a discrete
226
+ one-hot vector that is the index of the closest embedding vector e_j
227
+ z (continuous) -> z_q (discrete)
228
+ z.shape = (batch, channel, height, width)
229
+ quantization pipeline:
230
+ 1. get encoder input (B,C,H,W)
231
+ 2. flatten input to (B*H*W,C)
232
+ """
233
+ if get_code:
234
+ return self.get_codebook_entry(z)
235
+
236
+ # reshape z -> (batch, height, width, channel) and flatten
237
+ z = z.permute(0, 2, 3, 1).contiguous()
238
+ z_flattened = z.view(-1, self.e_dim)
239
+ ge_zero = (z_flattened > 0).float()
240
+ ones = torch.ones_like(z_flattened)
241
+ z_q = ones * ge_zero + -ones * (1 - ge_zero)
242
+
243
+ # preserve gradients
244
+ z_q = z_flattened + (z_q - z_flattened).detach()
245
+
246
+ # compute entropy loss
247
+ CatDist = torch.distributions.categorical.Categorical
248
+ logit = torch.stack(
249
+ [
250
+ -(z_flattened - torch.ones_like(z_q)).pow(2),
251
+ -(z_flattened - torch.ones_like(z_q) * -1).pow(2),
252
+ ],
253
+ dim=-1,
254
+ )
255
+ cat_dist = CatDist(logits=logit)
256
+ entropy = cat_dist.entropy().mean()
257
+ mean_prob = cat_dist.probs.mean(0)
258
+ mean_entropy = CatDist(probs=mean_prob).entropy().mean()
259
+
260
+ # compute loss for embedding
261
+ commit_loss = torch.mean(
262
+ (z_q.detach() - z_flattened) ** 2
263
+ ) + self.beta * torch.mean((z_q - z_flattened.detach()) ** 2)
264
+
265
+ # reshape back to match original input shape
266
+ z_q = z_q.view(z.shape)
267
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
268
+
269
+ return {
270
+ "z": z_q,
271
+ "quantizer_loss": commit_loss * self.commit_loss_multiplier,
272
+ "entropy_loss": (entropy - mean_entropy) * self.entropy_multiplier,
273
+ "indices": self.get_indices(z_q),
274
+ }
275
+
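How the look-up-free quantizer above maps latents to token ids, in miniature: every channel is binarized to ±1, and the sign pattern read as a binary number is the code index, so `codebook_dim=13` yields 2**13 = 8192 codes without any learned embedding table. Standalone sketch with a 3-dim code for readability (illustration only):

```python
import torch

codebook_dim = 3
power_vals = 2 ** torch.arange(codebook_dim - 1, -1, -1)           # [4, 2, 1]

z = torch.tensor([[0.7, -0.2, 0.1]])                                # one latent vector
ones = torch.ones_like(z)
z_q = torch.where(z > 0, ones, -ones)                               # [[1., -1., 1.]]
index = (power_vals * (z_q > 0).float()).sum(dim=1).long()          # bits 101 -> tensor([5])
assert index.item() == 5

# decoding mirrors get_codebook_entry: index -> bits -> ±1
bits = (index.unsqueeze(1) >> torch.arange(codebook_dim - 1, -1, -1)) & 1
assert torch.equal(bits.float() * 2 - 1, z_q)
```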
276
+
277
+ class VQGANDecoder(ModelMixin, ConfigMixin):
278
+ def __init__(self, ch: int = 128,
279
+ ch_mult: List[int] = [1, 1, 2, 2, 4],
280
+ num_res_blocks: List[int] = [4, 4, 3, 4, 3],
281
+ attn_resolutions: List[int] = [5],
282
+ dropout: float = 0.0,
283
+ in_ch: int = 3,
284
+ out_ch: int = 3,
285
+ resolution: int = 256,
286
+ z_channels: int = 13,
287
+ double_z: bool = False):
288
+ super().__init__()
289
+ self.ch = ch
290
+ self.temb_ch = 0
291
+ self.num_resolutions = len(ch_mult)
292
+ self.num_res_blocks = num_res_blocks
293
+ self.resolution = resolution
294
+ self.in_ch = in_ch
295
+ self.give_pre_end = False
296
+
297
+ self.z_channels = z_channels
298
+ # compute in_ch_mult, block_in and curr_res at lowest res
299
+ in_ch_mult = (1,) + tuple(ch_mult)
300
+ block_in = ch * ch_mult[self.num_resolutions - 1]
301
+ curr_res = self.resolution // 2 ** (self.num_resolutions - 1)
302
+ self.z_shape = (1, z_channels, curr_res, curr_res)
303
+ print(
304
+ "Working with z of shape {} = {} dimensions.".format(
305
+ self.z_shape, np.prod(self.z_shape)
306
+ )
307
+ )
308
+
309
+ # z to block_in
310
+ self.conv_in = torch.nn.Conv2d(
311
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
312
+ )
313
+
314
+ # middle
315
+ self.mid = nn.Module()
316
+ self.mid.block_1 = ResnetBlock(
317
+ in_channels=block_in,
318
+ out_channels=block_in,
319
+ temb_channels=self.temb_ch,
320
+ dropout=dropout,
321
+ )
322
+ self.mid.attn_1 = AttnBlock(block_in)
323
+ self.mid.block_2 = ResnetBlock(
324
+ in_channels=block_in,
325
+ out_channels=block_in,
326
+ temb_channels=self.temb_ch,
327
+ dropout=dropout,
328
+ )
329
+
330
+ # upsampling
331
+ self.up = nn.ModuleList()
332
+ for i_level in reversed(range(self.num_resolutions)):
333
+ block = nn.ModuleList()
334
+ attn = nn.ModuleList()
335
+ block_out = ch * ch_mult[i_level]
336
+ for i_block in range(self.num_res_blocks[i_level]):
337
+ block.append(
338
+ ResnetBlock(
339
+ in_channels=block_in,
340
+ out_channels=block_out,
341
+ temb_channels=self.temb_ch,
342
+ dropout=dropout,
343
+ )
344
+ )
345
+ block_in = block_out
346
+ if curr_res in attn_resolutions:
347
+ attn.append(AttnBlock(block_in))
348
+ up = nn.Module()
349
+ up.block = block
350
+ up.attn = attn
351
+ if i_level != 0:
352
+ up.upsample = Upsample(block_in, True)
353
+ curr_res = curr_res * 2
354
+ self.up.insert(0, up) # prepend to get consistent order
355
+
356
+ self.norm_out = Normalize(block_in)
357
+ self.conv_out = torch.nn.Conv2d(
358
+ block_in, out_ch, kernel_size=3, stride=1, padding=1
359
+ )
360
+ self.post_quant_conv = torch.nn.Conv2d(
361
+ z_channels, z_channels, 1
362
+ )
363
+
364
+
365
+ def forward(self, z):
366
+ # assert z.shape[1:] == self.z_shape[1:]
367
+ self.last_z_shape = z.shape
368
+ # timestep embedding
369
+ temb = None
370
+ output = dict()
371
+ z = self.post_quant_conv(z)
372
+
373
+ # z to block_in
374
+ h = self.conv_in(z)
375
+
376
+ # middle
377
+ h = self.mid.block_1(h, temb)
378
+ h = self.mid.attn_1(h)
379
+ h = self.mid.block_2(h, temb)
380
+
381
+ # upsampling
382
+ for i_level in reversed(range(self.num_resolutions)):
383
+ for i_block in range(self.num_res_blocks[i_level]):
384
+ h = self.up[i_level].block[i_block](h, temb)
385
+ if len(self.up[i_level].attn) > 0:
386
+ h = self.up[i_level].attn[i_block](h)
387
+ if i_level != 0:
388
+ h = self.up[i_level].upsample(h)
389
+
390
+ # end
391
+ output["output"] = h
392
+ if self.give_pre_end:
393
+ return output
394
+
395
+ h = self.norm_out(h)
396
+ h = nonlinearity(h)
397
+ h = self.conv_out(h)
398
+ output["output"] = h
399
+ return output
400
+
401
+
402
+ class MAGVITv2(ModelMixin, ConfigMixin):
403
+ @register_to_config
404
+ def __init__(
405
+ self,
406
+ ):
407
+ super().__init__()
408
+
409
+ self.encoder = VQGANEncoder()
410
+ self.decoder = VQGANDecoder()
411
+ self.quantize = LFQuantizer()
412
+
413
+ def forward(self, pixel_values, return_loss=False):
414
+ pass
415
+
416
+ def encode(self, pixel_values, return_loss=False):
417
+ hidden_states = self.encoder(pixel_values)
418
+ quantized_states = self.quantize(hidden_states)['z']
419
+ codebook_indices = self.quantize.get_indices(quantized_states).reshape(pixel_values.shape[0], -1)
420
+ output = (quantized_states, codebook_indices)
421
+ return output
422
+
423
+ def get_code(self, pixel_values):
424
+ hidden_states = self.encoder(pixel_values)
425
+ codebook_indices = self.quantize.get_indices(self.quantize(hidden_states)['z']).reshape(pixel_values.shape[0], -1)
426
+
427
+ return codebook_indices
428
+
429
+ def decode_code(self, codebook_indices, shape=None):
430
+ z_q = self.quantize.get_codebook_entry(codebook_indices, shape=shape)
431
+
432
+ reconstructed_pixel_values = self.decoder(z_q)["output"]
433
+ return reconstructed_pixel_values
434
+
435
+
436
+ if __name__ == '__main__':
437
+ encoder = VQGANEncoder()
438
+ import ipdb
439
+ ipdb.set_trace()
440
+ print()
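End-of-file note: with the default configs above (256x256 input, five resolution levels, z_channels=13) the tokenizer produces a 16x16 grid of 256 tokens drawn from a 2**13 = 8192-entry code space. A usage sketch of the encode/decode round trip (assumptions: randomly initialized weights and an importable package path; this only checks shapes, the reconstruction is meaningless without a checkpoint):

```python
import torch
from mcp_servers.fashion_vlm.models.modeling_magvitv2 import MAGVITv2  # assumed import path

model = MAGVITv2().eval()
with torch.no_grad():
    pixels = torch.randn(1, 3, 256, 256)
    codes = model.get_code(pixels)                 # (1, 256) token ids in [0, 8191]
    recon = model.decode_code(codes)               # (1, 3, 256, 256)
assert codes.shape == (1, 256) and recon.shape == (1, 3, 256, 256)
```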
mcp_servers/fashion_vlm/models/modeling_showo.py ADDED
@@ -0,0 +1,237 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 NUS Show Lab, HuggingFace.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from transformers import AutoConfig
19
+ from .modeling_utils import ConfigMixin, ModelMixin, register_to_config
20
+ from .sampling import cosine_schedule, mask_by_random_topk
21
+ from .phi import PhiForCausalLM
22
+
23
+
24
+ class Showo(ModelMixin, ConfigMixin):
25
+ _supports_gradient_checkpointing = True
26
+
27
+ @register_to_config
28
+ def __init__(
29
+ self,
30
+ w_clip_vit,
31
+ vocab_size,
32
+ llm_vocab_size,
33
+ llm_model_path='',
34
+ codebook_size=8192,
35
+ num_vq_tokens=256,
36
+ load_from_showo=True,
37
+ **kwargs,
38
+ ):
39
+ super().__init__()
40
+ self.vocab_size = vocab_size
41
+ self.register_to_config(mask_token_id=vocab_size - 1)
42
+ if load_from_showo:
43
+ config = AutoConfig.from_pretrained(llm_model_path)
44
+ self.showo = PhiForCausalLM(config)
45
+ else:
46
+ self.showo = PhiForCausalLM.from_pretrained(llm_model_path, attn_implementation='sdpa')
47
+ self.showo.resize_token_embeddings(self.vocab_size)
48
+ self.output_size = self.vocab_size
49
+
50
+ if self.w_clip_vit:
51
+ self.mm_projector = torch.nn.Sequential(
52
+ torch.nn.Linear(1024, 2048),
53
+ torch.nn.GELU(),
54
+ torch.nn.Linear(2048, 2048)
55
+ )
56
+
57
+ def forward(
58
+ self,
59
+ input_ids,
60
+ input_embeddings=None,
61
+ attention_mask=None,
62
+ labels=None,
63
+ label_smoothing=0.0,
64
+ batch_size_t2i=0,
65
+ batch_size_lm=0,
66
+ batch_size_mmu=0,
67
+ max_seq_length=128,
68
+ labels_mask_text=None,
69
+ labels_mask_image=None,
70
+ **kwargs,
71
+ ):
72
+
73
+ if input_embeddings is None:
74
+ logits = self.showo(input_ids=input_ids, attention_mask=attention_mask)['logits']
75
+ else:
76
+ logits = self.showo(inputs_embeds=input_embeddings, attention_mask=attention_mask)['logits']
77
+
78
+ if labels is not None:
79
+ # 1. Mask token prediction (discrete diffusion) for image generation
80
+ # Note that max_seq_length here refers to the maximum number of text tokens, which can be a bit confusing.
81
+ loss_t2i = F.cross_entropy(
82
+ logits[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1, self.output_size),
83
+ labels[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1), ignore_index=-100,
84
+ )
85
+
86
+ # 2. Next token prediction for language modeling
87
+ loss_lm = F.cross_entropy(
88
+ logits[batch_size_t2i:batch_size_t2i + batch_size_lm, :-1].contiguous().view(-1, self.output_size),
89
+ labels[batch_size_t2i:batch_size_t2i + batch_size_lm, 1:].contiguous().view(-1), ignore_index=-100,
90
+ )
91
+ # loss_lm = torch.tensor(0.0, device=logits.device)
92
+
93
+ # 3. Next token prediction for captioning/multimodal understanding
94
+ loss_mmu = F.cross_entropy(
95
+ logits[-batch_size_mmu:, :-1].contiguous().view(-1, self.output_size),
96
+ labels[-batch_size_mmu:, 1:].contiguous().view(-1), ignore_index=-100,
97
+ )
98
+
99
+ return logits, loss_t2i, loss_lm, loss_mmu
100
+
101
+ return logits
102
+
103
+ def t2i_generate(
104
+ self,
105
+ input_ids: torch.LongTensor = None,
106
+ uncond_input_ids: torch.LongTensor = None,
107
+ attention_mask=None,
108
+ temperature=1.0,
109
+ timesteps=18, # ideal number of steps is 18 in maskgit paper
110
+ guidance_scale=0,
111
+ noise_schedule=cosine_schedule,
112
+ generator: torch.Generator = None,
113
+ ):
114
+ # begin with all image token ids masked
115
+ mask_token_id = self.config.mask_token_id
116
+ num_vq_tokens = self.config.num_vq_tokens
117
+ num_new_special_tokens = 10
118
+ llm_vocab_size = self.config.llm_vocab_size
119
+ max_seq_length = 381
120
+
121
+ input_ids_minus_lm_vocab_size = input_ids[:, -(num_vq_tokens + 1):-1].clone()
122
+ input_ids_minus_lm_vocab_size = torch.where(
123
+ input_ids_minus_lm_vocab_size == mask_token_id,
124
+ mask_token_id,
125
+ input_ids_minus_lm_vocab_size - llm_vocab_size - num_new_special_tokens
126
+ )
127
+
128
+ # for classifier-free guidance
129
+ if uncond_input_ids is not None:
130
+ uncond_prefix = uncond_input_ids[:, :max_seq_length + 1]
131
+
132
+ for step in range(timesteps):
133
+ if uncond_input_ids is not None and guidance_scale > 0:
134
+ uncond_input_ids = torch.cat(
135
+ [uncond_prefix, input_ids[:, max_seq_length + 1:]], dim=1)
136
+ model_input = torch.cat([input_ids, uncond_input_ids])
137
+ cond_logits, uncond_logits = self(model_input, attention_mask=attention_mask).chunk(2)
138
+ # logits = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
139
+ # it seems that muse has a different cfg setting
140
+ logits = (1 + guidance_scale) * cond_logits - guidance_scale * uncond_logits
141
+ logits = logits[:, -(num_vq_tokens + 1):-1, llm_vocab_size + num_new_special_tokens:-1]
142
+ else:
143
+ logits = self(input_ids, attention_mask=attention_mask)
144
+ logits = logits[:, -(num_vq_tokens + 1):-1, llm_vocab_size + num_new_special_tokens:-1]
145
+
146
+ probs = logits.softmax(dim=-1)
147
+ sampled = probs.reshape(-1, logits.size(-1))
148
+ sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*logits.shape[:-1])
149
+
150
+ unknown_map = input_ids_minus_lm_vocab_size == mask_token_id
151
+ sampled_ids = torch.where(unknown_map, sampled_ids, input_ids_minus_lm_vocab_size)
152
+ # Defines the mask ratio for the next round. The number to mask out is
153
+ # determined by mask_ratio * unknown_number_in_the_beginning.
154
+ ratio = 1.0 * (step + 1) / timesteps
155
+ mask_ratio = noise_schedule(torch.tensor(ratio))
156
+ # Computes the probabilities of each selected tokens.
157
+ selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None])
158
+ selected_probs = selected_probs.squeeze(-1)
159
+
160
+ # Ignores the tokens given in the input by overwriting their confidence.
161
+ selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)
162
+ # Gets mask lens for each sample in the batch according to the mask ratio.
163
+ mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(logits.device)
164
+ # Keeps at least one prediction unmasked in this round and also masks out at
+ # least one token for the next iteration
166
+ mask_len = torch.max(
167
+ torch.tensor([1], device=logits.device), torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
168
+ )
169
+ # Adds noise for randomness
170
+ temperature = temperature * (1.0 - ratio)
171
+ masking = mask_by_random_topk(mask_len, selected_probs, temperature, generator=generator)
172
+ # Masks tokens with lower confidence.
173
+ input_ids[:, -(num_vq_tokens + 1):-1] = torch.where(masking, mask_token_id,
174
+ sampled_ids + llm_vocab_size
175
+ + num_new_special_tokens)
176
+ input_ids_minus_lm_vocab_size = torch.where(masking, mask_token_id, sampled_ids)
177
+
178
+ return sampled_ids
179
+
180
+ @torch.no_grad()
181
+ def mmu_generate(self, idx=None, input_embeddings=None, attention_mask=None, max_new_tokens=100, temperature=1.0, top_k=None, eot_token=None):
182
+ """
183
+ Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
184
+ the sequence max_new_tokens times, feeding the predictions back into the model each time.
185
+ Most likely you'll want to make sure to be in model.eval() mode of operation for this.
186
+ """
187
+ try:
188
+ device = idx.device
189
+ except:
190
+ device = input_embeddings.device
191
+
192
+ result = []
193
+ for _ in range(max_new_tokens):
194
+ # if the sequence context is growing too long we must crop it at block_size
195
+ # idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
196
+ # forward the model to get the logits for the index in the sequence
197
+ # logits, _ = self(idx_cond)
198
+ logits = self(idx, input_embeddings=input_embeddings, attention_mask=attention_mask)
199
+
200
+ L = attention_mask.shape[-1]
201
+ attention_mask = attention_mask.squeeze()
202
+ attention_mask_a = torch.hstack(
203
+ [
204
+ attention_mask, # L, L
205
+ torch.zeros((L, 1)).to(device) + torch.finfo(logits.dtype).min,
206
+ ]
207
+ )
208
+ attention_mask_b = torch.vstack(
209
+ [
210
+ attention_mask_a, # L, L+1
211
+ torch.hstack([attention_mask[-1, :], torch.tensor([0]).to(device)]).unsqueeze(0),
212
+ ]
213
+ )
214
+ attention_mask = attention_mask_b
215
+
216
+ # pluck the logits at the final step and scale by desired temperature
217
+ logits = logits[:, -1, :] / temperature
218
+ # optionally crop the logits to only the top k options
219
+ if top_k is not None:
220
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
221
+ logits[logits < v[:, [-1]]] = -float('Inf')
222
+ # apply softmax to convert logits to (normalized) probabilities
223
+ probs = F.softmax(logits, dim=-1)
224
+ # sample from the distribution
225
+ idx_next = torch.multinomial(probs, num_samples=1)
226
+ result.append(idx_next[0][0])
227
+ # append sampled index to the running sequence and continue
228
+ if self.config.w_clip_vit:
229
+ idx_next_embeddings = self.showo.model.embed_tokens(idx_next)
230
+ input_embeddings = torch.cat([input_embeddings, idx_next_embeddings], dim=1)
231
+ else:
232
+ idx = torch.cat((idx, idx_next), dim=1)
233
+
234
+ if eot_token is not None and idx_next.cpu() == eot_token:
235
+ break
236
+
237
+ return result
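End-of-file note on `t2i_generate`: it is MaskGIT-style iterative decoding — at each step the noise schedule gives the fraction of image tokens to re-mask, and only the lowest-confidence predictions are masked again. A standalone sketch of that schedule arithmetic (assumption: the cosine schedule is cos(pi/2 * r), which is what `.sampling.cosine_schedule` is expected to compute; the min/max clamping done above is omitted here):

```python
import math
import torch

def cosine_schedule(r: torch.Tensor) -> torch.Tensor:   # assumed form of .sampling.cosine_schedule
    return torch.cos(r * math.pi / 2)

num_vq_tokens, timesteps = 256, 18
for step in range(timesteps):
    ratio = (step + 1) / timesteps
    mask_ratio = cosine_schedule(torch.tensor(ratio))
    mask_len = int((num_vq_tokens * mask_ratio).floor())  # tokens re-masked before the next step
    if step in (0, 8, 17):
        print(step, mask_len)   # 255 -> 181 -> 0: most tokens stay masked early, none at the end
```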
mcp_servers/fashion_vlm/models/modeling_utils.py ADDED
@@ -0,0 +1,1207 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import inspect
18
+ import itertools
19
+ import json
20
+ import os
21
+ import re
22
+ from collections import OrderedDict
23
+ from functools import partial
24
+ from pathlib import Path
25
+ from typing import Any, Callable, List, Optional, Tuple, Union
26
+
27
+ import safetensors
28
+ import torch
29
+ from huggingface_hub import create_repo, split_torch_state_dict_into_shards
30
+ from huggingface_hub.utils import validate_hf_hub_args
31
+ from torch import Tensor, nn
32
+
33
+ from diffusers import __version__
34
+ from diffusers.utils import (
35
+ FLAX_WEIGHTS_NAME,
36
+ SAFE_WEIGHTS_INDEX_NAME,
37
+ WEIGHTS_INDEX_NAME,
38
+ _add_variant,
39
+ _get_checkpoint_shard_files,
40
+ _get_model_file,
41
+ deprecate,
42
+ is_accelerate_available,
43
+ is_torch_version,
44
+ logging,
45
+ )
46
+
47
+ CONFIG_NAME = "config.json"
48
+ WEIGHTS_NAME = "pytorch_model.bin"
49
+ SAFETENSORS_WEIGHTS_NAME = "pytorch_model.safetensors"
50
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
51
+
52
+ from diffusers.utils.hub_utils import (
53
+ PushToHubMixin,
54
+ load_or_create_model_card,
55
+ populate_model_card,
56
+ )
57
+ from diffusers.models.model_loading_utils import (
58
+ _determine_device_map,
59
+ _fetch_index_file,
60
+ _load_state_dict_into_model,
61
+ load_model_dict_into_meta,
62
+ load_state_dict,
63
+ )
64
+
65
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
66
+
67
+ logger = logging.get_logger(__name__)
68
+
69
+ _REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}")
70
+
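A tiny illustration of what `_REGEX_SHARD` matches: the stem of sharded weight filenames such as `pytorch_model-00001-of-00005` (extension stripped), which `save_pretrained` below uses to clean up stale shards. Standalone sketch:

```python
import re

_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}")
assert _REGEX_SHARD.fullmatch("pytorch_model-00001-of-00005") is not None
assert _REGEX_SHARD.fullmatch("pytorch_model.safetensors") is None
```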
71
+
72
+ if is_torch_version(">=", "1.9.0"):
73
+ _LOW_CPU_MEM_USAGE_DEFAULT = True
74
+ else:
75
+ _LOW_CPU_MEM_USAGE_DEFAULT = False
76
+
77
+
78
+ if is_accelerate_available():
79
+ import accelerate
80
+
81
+
82
+ def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
83
+ try:
84
+ parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
85
+ return next(parameters_and_buffers).device
86
+ except StopIteration:
87
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
88
+
89
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
90
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
91
+ return tuples
92
+
93
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
94
+ first_tuple = next(gen)
95
+ return first_tuple[1].device
96
+
97
+
98
+ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
99
+ try:
100
+ params = tuple(parameter.parameters())
101
+ if len(params) > 0:
102
+ return params[0].dtype
103
+
104
+ buffers = tuple(parameter.buffers())
105
+ if len(buffers) > 0:
106
+ return buffers[0].dtype
107
+
108
+ except StopIteration:
109
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
110
+
111
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
112
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
113
+ return tuples
114
+
115
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
116
+ first_tuple = next(gen)
117
+ return first_tuple[1].dtype
118
+
119
+
120
+ class ModelMixin(torch.nn.Module, PushToHubMixin):
121
+ r"""
122
+ Base class for all models.
123
+
124
+ [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
125
+ saving models.
126
+
127
+ - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`].
128
+ """
129
+
130
+ config_name = CONFIG_NAME
131
+ _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
132
+ _supports_gradient_checkpointing = False
133
+ _keys_to_ignore_on_load_unexpected = None
134
+ _no_split_modules = None
135
+
136
+ def __init__(self):
137
+ super().__init__()
138
+
139
+ def __getattr__(self, name: str) -> Any:
140
+ """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
141
+ config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
142
+ __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__':
143
+ https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
144
+ """
145
+
146
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
147
+ is_attribute = name in self.__dict__
148
+
149
+ if is_in_config and not is_attribute:
150
+ deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
151
+ deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
152
+ return self._internal_dict[name]
153
+
154
+ # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
155
+ return super().__getattr__(name)
156
+
157
+ @property
158
+ def is_gradient_checkpointing(self) -> bool:
159
+ """
160
+ Whether gradient checkpointing is activated for this model or not.
161
+ """
162
+ return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
163
+
164
+ def enable_gradient_checkpointing(self) -> None:
165
+ """
166
+ Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
167
+ *checkpoint activations* in other frameworks).
168
+ """
169
+ if not self._supports_gradient_checkpointing:
170
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
171
+ self.apply(partial(self._set_gradient_checkpointing, value=True))
172
+
173
+ def disable_gradient_checkpointing(self) -> None:
174
+ """
175
+ Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
176
+ *checkpoint activations* in other frameworks).
177
+ """
178
+ if self._supports_gradient_checkpointing:
179
+ self.apply(partial(self._set_gradient_checkpointing, value=False))
180
+
181
+ def set_use_npu_flash_attention(self, valid: bool) -> None:
182
+ r"""
183
+ Set the switch for the npu flash attention.
184
+ """
185
+
186
+ def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
187
+ if hasattr(module, "set_use_npu_flash_attention"):
188
+ module.set_use_npu_flash_attention(valid)
189
+
190
+ for child in module.children():
191
+ fn_recursive_set_npu_flash_attention(child)
192
+
193
+ for module in self.children():
194
+ if isinstance(module, torch.nn.Module):
195
+ fn_recursive_set_npu_flash_attention(module)
196
+
197
+ def enable_npu_flash_attention(self) -> None:
198
+ r"""
199
+ Enable npu flash attention from torch_npu
200
+
201
+ """
202
+ self.set_use_npu_flash_attention(True)
203
+
204
+ def disable_npu_flash_attention(self) -> None:
205
+ r"""
206
+ disable npu flash attention from torch_npu
207
+
208
+ """
209
+ self.set_use_npu_flash_attention(False)
210
+
211
+ def set_use_memory_efficient_attention_xformers(
212
+ self, valid: bool, attention_op: Optional[Callable] = None
213
+ ) -> None:
214
+ # Recursively walk through all the children.
215
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method
216
+ # gets the message
217
+ def fn_recursive_set_mem_eff(module: torch.nn.Module):
218
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
219
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
220
+
221
+ for child in module.children():
222
+ fn_recursive_set_mem_eff(child)
223
+
224
+ for module in self.children():
225
+ if isinstance(module, torch.nn.Module):
226
+ fn_recursive_set_mem_eff(module)
227
+
228
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None:
229
+ r"""
230
+ Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
231
+
232
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
233
+ inference. Speed up during training is not guaranteed.
234
+
235
+ <Tip warning={true}>
236
+
237
+ ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
238
+ precedent.
239
+
240
+ </Tip>
241
+
242
+ Parameters:
243
+ attention_op (`Callable`, *optional*):
244
+ Override the default `None` operator for use as `op` argument to the
245
+ [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
246
+ function of xFormers.
247
+
248
+ Examples:
249
+
250
+ ```py
251
+ >>> import torch
252
+ >>> from diffusers import UNet2DConditionModel
253
+ >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
254
+
255
+ >>> model = UNet2DConditionModel.from_pretrained(
256
+ ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
257
+ ... )
258
+ >>> model = model.to("cuda")
259
+ >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
260
+ ```
261
+ """
262
+ self.set_use_memory_efficient_attention_xformers(True, attention_op)
263
+
264
+ def disable_xformers_memory_efficient_attention(self) -> None:
265
+ r"""
266
+ Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
267
+ """
268
+ self.set_use_memory_efficient_attention_xformers(False)
269
+
270
+ def save_pretrained(
271
+ self,
272
+ save_directory: Union[str, os.PathLike],
273
+ is_main_process: bool = True,
274
+ save_function: Optional[Callable] = None,
275
+ safe_serialization: bool = True,
276
+ variant: Optional[str] = None,
277
+ max_shard_size: Union[int, str] = "10GB",
278
+ push_to_hub: bool = False,
279
+ **kwargs,
280
+ ):
281
+ """
282
+ Save a model and its configuration file to a directory so that it can be reloaded using the
283
+ [`~models.ModelMixin.from_pretrained`] class method.
284
+
285
+ Arguments:
286
+ save_directory (`str` or `os.PathLike`):
287
+ Directory to save a model and its configuration file to. Will be created if it doesn't exist.
288
+ is_main_process (`bool`, *optional*, defaults to `True`):
289
+ Whether the process calling this is the main process or not. Useful during distributed training and you
290
+ need to call this function on all processes. In this case, set `is_main_process=True` only on the main
291
+ process to avoid race conditions.
292
+ save_function (`Callable`):
293
+ The function to use to save the state dictionary. Useful during distributed training when you need to
294
+ replace `torch.save` with another method. Can be configured with the environment variable
295
+ `DIFFUSERS_SAVE_MODE`.
296
+ safe_serialization (`bool`, *optional*, defaults to `True`):
297
+ Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
298
+ variant (`str`, *optional*):
299
+ If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
300
+ max_shard_size (`int` or `str`, defaults to `"10GB"`):
301
+ The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
302
+ lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`).
303
+ If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain
304
+ period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`.
305
+ This is to establish a common default size for this argument across different libraries in the Hugging
306
+ Face ecosystem (`transformers`, and `accelerate`, for example).
307
+ push_to_hub (`bool`, *optional*, defaults to `False`):
308
+ Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
309
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
310
+ namespace).
311
+ kwargs (`Dict[str, Any]`, *optional*):
312
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
313
+ """
314
+ if os.path.isfile(save_directory):
315
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
316
+ return
317
+
318
+ weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
319
+ weights_name = _add_variant(weights_name, variant)
320
+ weight_name_split = weights_name.split(".")
321
+ if len(weight_name_split) in [2, 3]:
322
+ weights_name_pattern = weight_name_split[0] + "{suffix}." + ".".join(weight_name_split[1:])
323
+ else:
324
+ raise ValueError(f"Invalid {weights_name} provided.")
325
+
326
+ os.makedirs(save_directory, exist_ok=True)
327
+
328
+ if push_to_hub:
329
+ commit_message = kwargs.pop("commit_message", None)
330
+ private = kwargs.pop("private", False)
331
+ create_pr = kwargs.pop("create_pr", False)
332
+ token = kwargs.pop("token", None)
333
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
334
+ repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
335
+
336
+ # Only save the model itself if we are using distributed training
337
+ model_to_save = self
338
+
339
+ # Attach architecture to the config
340
+ # Save the config
341
+ if is_main_process:
342
+ model_to_save.save_config(save_directory)
343
+
344
+ # Save the model
345
+ state_dict = model_to_save.state_dict()
346
+
347
+ # Save the model
348
+ state_dict_split = split_torch_state_dict_into_shards(
349
+ state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern
350
+ )
351
+
352
+ # Clean the folder from a previous save
353
+ if is_main_process:
354
+ for filename in os.listdir(save_directory):
355
+ if filename in state_dict_split.filename_to_tensors.keys():
356
+ continue
357
+ full_filename = os.path.join(save_directory, filename)
358
+ if not os.path.isfile(full_filename):
359
+ continue
360
+ weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "")
361
+ weights_without_ext = weights_without_ext.replace("{suffix}", "")
362
+ filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "")
363
+ # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
364
+ if (
365
+ filename.startswith(weights_without_ext)
366
+ and _REGEX_SHARD.fullmatch(filename_without_ext) is not None
367
+ ):
368
+ os.remove(full_filename)
369
+
370
+ for filename, tensors in state_dict_split.filename_to_tensors.items():
371
+ shard = {tensor: state_dict[tensor] for tensor in tensors}
372
+ filepath = os.path.join(save_directory, filename)
373
+ if safe_serialization:
374
+ # At some point we will need to deal better with save_function (used for TPU and other distributed
375
+ # joyfulness), but for now this is enough.
376
+ safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
377
+ else:
378
+ torch.save(shard, filepath)
379
+
380
+ if state_dict_split.is_sharded:
381
+ index = {
382
+ "metadata": state_dict_split.metadata,
383
+ "weight_map": state_dict_split.tensor_to_filename,
384
+ }
385
+ save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
386
+ save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
387
+ # Save the index as well
388
+ with open(save_index_file, "w", encoding="utf-8") as f:
389
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
390
+ f.write(content)
391
+ logger.info(
392
+ f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
393
+ f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
394
+ f"index located at {save_index_file}."
395
+ )
396
+ else:
397
+ path_to_weights = os.path.join(save_directory, weights_name)
398
+ logger.info(f"Model weights saved in {path_to_weights}")
399
+
400
+ if push_to_hub:
401
+ # Create a new empty model card and eventually tag it
402
+ model_card = load_or_create_model_card(repo_id, token=token)
403
+ model_card = populate_model_card(model_card)
404
+ model_card.save(Path(save_directory, "README.md").as_posix())
405
+
406
+ self._upload_folder(
407
+ save_directory,
408
+ repo_id,
409
+ token=token,
410
+ commit_message=commit_message,
411
+ create_pr=create_pr,
412
+ )
413
+
414
+ @classmethod
415
+ @validate_hf_hub_args
416
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
417
+ r"""
418
+ Instantiate a pretrained PyTorch model from a pretrained model configuration.
419
+
420
+ The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
421
+ train the model, set it back in training mode with `model.train()`.
422
+
423
+ Parameters:
424
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
425
+ Can be either:
426
+
427
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
428
+ the Hub.
429
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
430
+ with [`~ModelMixin.save_pretrained`].
431
+
432
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
433
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
434
+ is not used.
435
+ torch_dtype (`str` or `torch.dtype`, *optional*):
436
+ Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
437
+ dtype is automatically derived from the model's weights.
438
+ force_download (`bool`, *optional*, defaults to `False`):
439
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
440
+ cached versions if they exist.
441
+ proxies (`Dict[str, str]`, *optional*):
442
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
443
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
444
+ output_loading_info (`bool`, *optional*, defaults to `False`):
445
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
446
+ local_files_only(`bool`, *optional*, defaults to `False`):
447
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
448
+ won't be downloaded from the Hub.
449
+ token (`str` or *bool*, *optional*):
450
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
451
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
452
+ revision (`str`, *optional*, defaults to `"main"`):
453
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
454
+ allowed by Git.
455
+ from_flax (`bool`, *optional*, defaults to `False`):
456
+ Load the model weights from a Flax checkpoint save file.
457
+ subfolder (`str`, *optional*, defaults to `""`):
458
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
459
+ mirror (`str`, *optional*):
460
+ Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
461
+ guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
462
+ information.
463
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
464
+ A map that specifies where each submodule should go. It doesn't need to be defined for each
465
+ parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
466
+ same device. Defaults to `None`, meaning that the model will be loaded on CPU.
467
+
468
+ Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
469
+ more information about each option see [designing a device
470
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
471
+ max_memory (`Dict`, *optional*):
472
+ A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
473
+ each GPU and the available CPU RAM if unset.
474
+ offload_folder (`str` or `os.PathLike`, *optional*):
475
+ The path to offload weights if `device_map` contains the value `"disk"`.
476
+ offload_state_dict (`bool`, *optional*):
477
+ If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
478
+ the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
479
+ when there is some disk offload.
480
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
481
+ Speed up model loading only loading the pretrained weights and not initializing the weights. This also
482
+ tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
483
+ Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
484
+ argument to `True` will raise an error.
485
+ variant (`str`, *optional*):
486
+ Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
487
+ loading `from_flax`.
488
+ use_safetensors (`bool`, *optional*, defaults to `None`):
489
+ If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
490
+ `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
491
+ weights. If set to `False`, `safetensors` weights are not loaded.
492
+
493
+ <Tip>
494
+
495
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
496
+ `huggingface-cli login`. You can also activate the special
497
+ ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
498
+ firewalled environment.
499
+
500
+ </Tip>
501
+
502
+ Example:
503
+
504
+ ```py
505
+ from diffusers import UNet2DConditionModel
506
+
507
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
508
+ ```
509
+
510
+ If you get the error message below, you need to finetune the weights for your downstream task:
511
+
512
+ ```bash
513
+ Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
514
+ - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
515
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
516
+ ```
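As a further, hypothetical sketch of the loading arguments documented above, the snippet below combines `torch_dtype`, `variant`, and `low_cpu_mem_usage` with the same checkpoint used in the example; whether the repository actually ships an `fp16` variant is an assumption here.

```py
import torch
from diffusers import UNet2DConditionModel

# Load the UNet in half precision from the fp16 variant files (if the repo provides them),
# keeping peak CPU memory low during loading (requires `accelerate`).
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="unet",
    torch_dtype=torch.float16,
    variant="fp16",
    low_cpu_mem_usage=True,
)
```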
517
+ """
518
+ cache_dir = kwargs.pop("cache_dir", None)
519
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
520
+ force_download = kwargs.pop("force_download", False)
521
+ from_flax = kwargs.pop("from_flax", False)
522
+ proxies = kwargs.pop("proxies", None)
523
+ output_loading_info = kwargs.pop("output_loading_info", False)
524
+ local_files_only = kwargs.pop("local_files_only", None)
525
+ token = kwargs.pop("token", None)
526
+ revision = kwargs.pop("revision", None)
527
+ torch_dtype = kwargs.pop("torch_dtype", None)
528
+ subfolder = kwargs.pop("subfolder", None)
529
+ device_map = kwargs.pop("device_map", None)
530
+ max_memory = kwargs.pop("max_memory", None)
531
+ offload_folder = kwargs.pop("offload_folder", None)
532
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
533
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
534
+ variant = kwargs.pop("variant", None)
535
+ use_safetensors = kwargs.pop("use_safetensors", None)
536
+
537
+ allow_pickle = False
538
+ if use_safetensors is None:
539
+ use_safetensors = True
540
+ allow_pickle = True
541
+
542
+ if low_cpu_mem_usage and not is_accelerate_available():
543
+ low_cpu_mem_usage = False
544
+ logger.warning(
545
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
546
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
547
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
548
+ " install accelerate\n```\n."
549
+ )
550
+
551
+ if device_map is not None and not is_accelerate_available():
552
+ raise NotImplementedError(
553
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
554
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
555
+ )
556
+
557
+ # Check if we can handle device_map and dispatching the weights
558
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
559
+ raise NotImplementedError(
560
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
561
+ " `device_map=None`."
562
+ )
563
+
564
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
565
+ raise NotImplementedError(
566
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
567
+ " `low_cpu_mem_usage=False`."
568
+ )
569
+
570
+ if low_cpu_mem_usage is False and device_map is not None:
571
+ raise ValueError(
572
+ f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
573
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
574
+ )
575
+
576
+ # change device_map into a map if we passed an int, a str or a torch.device
577
+ if isinstance(device_map, torch.device):
578
+ device_map = {"": device_map}
579
+ elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
580
+ try:
581
+ device_map = {"": torch.device(device_map)}
582
+ except RuntimeError:
583
+ raise ValueError(
584
+ "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or "
585
+ f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}."
586
+ )
587
+ elif isinstance(device_map, int):
588
+ if device_map < 0:
589
+ raise ValueError(
590
+ "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' "
591
+ )
592
+ else:
593
+ device_map = {"": device_map}
594
+
595
+ if device_map is not None:
596
+ if low_cpu_mem_usage is None:
597
+ low_cpu_mem_usage = True
598
+ elif not low_cpu_mem_usage:
599
+ raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")
600
+
601
+ if low_cpu_mem_usage:
602
+ if device_map is not None and not is_torch_version(">=", "1.10"):
603
+ # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
604
+ raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.")
605
+
606
+ # Load config if we don't provide a configuration
607
+ config_path = pretrained_model_name_or_path
608
+
609
+ user_agent = {
610
+ "diffusers": __version__,
611
+ "file_type": "model",
612
+ "framework": "pytorch",
613
+ }
614
+
615
+ # load config
616
+ config, unused_kwargs, commit_hash = cls.load_config(
617
+ config_path,
618
+ cache_dir=cache_dir,
619
+ return_unused_kwargs=True,
620
+ return_commit_hash=True,
621
+ force_download=force_download,
622
+ proxies=proxies,
623
+ local_files_only=local_files_only,
624
+ token=token,
625
+ revision=revision,
626
+ subfolder=subfolder,
627
+ user_agent=user_agent,
628
+ **kwargs,
629
+ )
630
+
631
+ # Determine if we're loading from a directory of sharded checkpoints.
632
+ is_sharded = False
633
+ index_file = None
634
+ is_local = os.path.isdir(pretrained_model_name_or_path)
635
+ index_file = _fetch_index_file(
636
+ is_local=is_local,
637
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
638
+ subfolder=subfolder or "",
639
+ use_safetensors=use_safetensors,
640
+ cache_dir=cache_dir,
641
+ variant=variant,
642
+ force_download=force_download,
643
+ proxies=proxies,
644
+ local_files_only=local_files_only,
645
+ token=token,
646
+ revision=revision,
647
+ user_agent=user_agent,
648
+ commit_hash=commit_hash,
649
+ )
650
+ if index_file is not None and index_file.is_file():
651
+ is_sharded = True
652
+
653
+ if is_sharded and from_flax:
654
+ raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.")
655
+
656
+ # load model
657
+ model_file = None
658
+ if from_flax:
659
+ model_file = _get_model_file(
660
+ pretrained_model_name_or_path,
661
+ weights_name=FLAX_WEIGHTS_NAME,
662
+ cache_dir=cache_dir,
663
+ force_download=force_download,
664
+ proxies=proxies,
665
+ local_files_only=local_files_only,
666
+ token=token,
667
+ revision=revision,
668
+ subfolder=subfolder,
669
+ user_agent=user_agent,
670
+ commit_hash=commit_hash,
671
+ )
672
+ model = cls.from_config(config, **unused_kwargs)
673
+
674
+ # Convert the weights
675
+ from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
676
+
677
+ model = load_flax_checkpoint_in_pytorch_model(model, model_file)
678
+ else:
679
+ if is_sharded:
680
+ sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files(
681
+ pretrained_model_name_or_path,
682
+ index_file,
683
+ cache_dir=cache_dir,
684
+ proxies=proxies,
685
+ local_files_only=local_files_only,
686
+ token=token,
687
+ user_agent=user_agent,
688
+ revision=revision,
689
+ subfolder=subfolder or "",
690
+ )
691
+
692
+ elif use_safetensors and not is_sharded:
693
+ try:
694
+ model_file = _get_model_file(
695
+ pretrained_model_name_or_path,
696
+ weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
697
+ cache_dir=cache_dir,
698
+ force_download=force_download,
699
+ proxies=proxies,
700
+ local_files_only=local_files_only,
701
+ token=token,
702
+ revision=revision,
703
+ subfolder=subfolder,
704
+ user_agent=user_agent,
705
+ commit_hash=commit_hash,
706
+ )
707
+
708
+ except IOError as e:
709
+ logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
710
+ if not allow_pickle:
711
+ raise
712
+ logger.warning(
713
+ "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
714
+ )
715
+
716
+ if model_file is None and not is_sharded:
717
+ model_file = _get_model_file(
718
+ pretrained_model_name_or_path,
719
+ weights_name=_add_variant(WEIGHTS_NAME, variant),
720
+ cache_dir=cache_dir,
721
+ force_download=force_download,
722
+ proxies=proxies,
723
+ local_files_only=local_files_only,
724
+ token=token,
725
+ revision=revision,
726
+ subfolder=subfolder,
727
+ user_agent=user_agent,
728
+ commit_hash=commit_hash,
729
+ )
730
+
731
+ if low_cpu_mem_usage:
732
+ # Instantiate model with empty weights
733
+ with accelerate.init_empty_weights():
734
+ model = cls.from_config(config, **unused_kwargs)
735
+
736
+ # if device_map is None, load the state dict and move the params from meta device to the cpu
737
+ if device_map is None and not is_sharded:
738
+ param_device = "cpu"
739
+ state_dict = load_state_dict(model_file, variant=variant)
740
+ model._convert_deprecated_attention_blocks(state_dict)
741
+ # move the params from meta device to cpu
742
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
743
+ if len(missing_keys) > 0:
744
+ raise ValueError(
745
+ f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
746
+ f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
747
+ " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
748
+ " those weights or else make sure your checkpoint file is correct."
749
+ )
750
+
751
+ unexpected_keys = load_model_dict_into_meta(
752
+ model,
753
+ state_dict,
754
+ device=param_device,
755
+ dtype=torch_dtype,
756
+ model_name_or_path=pretrained_model_name_or_path,
757
+ )
758
+
759
+ if cls._keys_to_ignore_on_load_unexpected is not None:
760
+ for pat in cls._keys_to_ignore_on_load_unexpected:
761
+ unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
762
+
763
+ if len(unexpected_keys) > 0:
764
+ logger.warning(
765
+ f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {', '.join(unexpected_keys)}"
766
+ )
767
+
768
+ else: # else let accelerate handle loading and dispatching.
769
+ # Load weights and dispatch according to the device_map
770
+ # by default the device_map is None and the weights are loaded on the CPU
771
+ force_hook = True
772
+ device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
773
+ if device_map is None and is_sharded:
774
+ # we load the parameters on the cpu
775
+ device_map = {"": "cpu"}
776
+ force_hook = False
777
+ try:
778
+ accelerate.load_checkpoint_and_dispatch(
779
+ model,
780
+ model_file if not is_sharded else index_file,
781
+ device_map,
782
+ max_memory=max_memory,
783
+ offload_folder=offload_folder,
784
+ offload_state_dict=offload_state_dict,
785
+ dtype=torch_dtype,
786
+ force_hooks=force_hook,
787
+ strict=True,
788
+ )
789
+ except AttributeError as e:
790
+ # When using accelerate loading, we do not have the ability to load the state
791
+ # dict and rename the weight names manually. Additionally, accelerate skips
792
+ # torch loading conventions and directly writes into `module.{_buffers, _parameters}`
793
+ # (which look like they should be private variables?), so we can't use the standard hooks
794
+ # to rename parameters on load. We need to mimic the original weight names so the correct
795
+ # attributes are available. After we have loaded the weights, we convert the deprecated
796
+ # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert
797
+ # the weights so we don't have to do this again.
798
+
799
+ if "'Attention' object has no attribute" in str(e):
800
+ logger.warning(
801
+ f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}"
802
+ " was saved with deprecated attention block weight names. We will load it with the deprecated attention block"
803
+ " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion,"
804
+ " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint,"
805
+ " please also re-upload it or open a PR on the original repository."
806
+ )
807
+ model._temp_convert_self_to_deprecated_attention_blocks()
808
+ accelerate.load_checkpoint_and_dispatch(
809
+ model,
810
+ model_file if not is_sharded else index_file,
811
+ device_map,
812
+ max_memory=max_memory,
813
+ offload_folder=offload_folder,
814
+ offload_state_dict=offload_state_dict,
815
+ dtype=torch_dtype,
816
+ force_hooks=force_hook,
817
+ strict=True,
818
+ )
819
+ model._undo_temp_convert_self_to_deprecated_attention_blocks()
820
+ else:
821
+ raise e
822
+
823
+ loading_info = {
824
+ "missing_keys": [],
825
+ "unexpected_keys": [],
826
+ "mismatched_keys": [],
827
+ "error_msgs": [],
828
+ }
829
+ else:
830
+ model = cls.from_config(config, **unused_kwargs)
831
+
832
+ state_dict = load_state_dict(model_file, variant=variant)
833
+ model._convert_deprecated_attention_blocks(state_dict)
834
+
835
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
836
+ model,
837
+ state_dict,
838
+ model_file,
839
+ pretrained_model_name_or_path,
840
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
841
+ )
842
+
843
+ loading_info = {
844
+ "missing_keys": missing_keys,
845
+ "unexpected_keys": unexpected_keys,
846
+ "mismatched_keys": mismatched_keys,
847
+ "error_msgs": error_msgs,
848
+ }
849
+
850
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
851
+ raise ValueError(
852
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
853
+ )
854
+ elif torch_dtype is not None:
855
+ model = model.to(torch_dtype)
856
+
857
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
858
+
859
+ # Set model in evaluation mode to deactivate DropOut modules by default
860
+ model.eval()
861
+ if output_loading_info:
862
+ return model, loading_info
863
+
864
+ return model
865
+
866
+ @classmethod
867
+ def _load_pretrained_model(
868
+ cls,
869
+ model,
870
+ state_dict: OrderedDict,
871
+ resolved_archive_file,
872
+ pretrained_model_name_or_path: Union[str, os.PathLike],
873
+ ignore_mismatched_sizes: bool = False,
874
+ ):
875
+ # Retrieve missing & unexpected_keys
876
+ model_state_dict = model.state_dict()
877
+ loaded_keys = list(state_dict.keys())
878
+
879
+ expected_keys = list(model_state_dict.keys())
880
+
881
+ original_loaded_keys = loaded_keys
882
+
883
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
884
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
885
+
886
+ # Make sure we are able to load base models as well as derived models (with heads)
887
+ model_to_load = model
888
+
889
+ def _find_mismatched_keys(
890
+ state_dict,
891
+ model_state_dict,
892
+ loaded_keys,
893
+ ignore_mismatched_sizes,
894
+ ):
895
+ mismatched_keys = []
896
+ if ignore_mismatched_sizes:
897
+ for checkpoint_key in loaded_keys:
898
+ model_key = checkpoint_key
899
+
900
+ if (
901
+ model_key in model_state_dict
902
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
903
+ ):
904
+ mismatched_keys.append(
905
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
906
+ )
907
+ del state_dict[checkpoint_key]
908
+ return mismatched_keys
909
+
910
+ if state_dict is not None:
911
+ # Whole checkpoint
912
+ mismatched_keys = _find_mismatched_keys(
913
+ state_dict,
914
+ model_state_dict,
915
+ original_loaded_keys,
916
+ ignore_mismatched_sizes,
917
+ )
918
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
919
+
920
+ if len(error_msgs) > 0:
921
+ error_msg = "\n\t".join(error_msgs)
922
+ if "size mismatch" in error_msg:
923
+ error_msg += (
924
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
925
+ )
926
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
927
+
928
+ if len(unexpected_keys) > 0:
929
+ logger.warning(
930
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
931
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
932
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
933
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
934
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
935
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
936
+ " identical (initializing a BertForSequenceClassification model from a"
937
+ " BertForSequenceClassification model)."
938
+ )
939
+ else:
940
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
941
+ if len(missing_keys) > 0:
942
+ logger.warning(
943
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
944
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
945
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
946
+ )
947
+ elif len(mismatched_keys) == 0:
948
+ logger.info(
949
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
950
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
951
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
952
+ " without further training."
953
+ )
954
+ if len(mismatched_keys) > 0:
955
+ mismatched_warning = "\n".join(
956
+ [
957
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
958
+ for key, shape1, shape2 in mismatched_keys
959
+ ]
960
+ )
961
+ logger.warning(
962
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
963
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
964
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
965
+ " able to use it for predictions and inference."
966
+ )
967
+
968
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
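A small standalone illustration (toy key names, not taken from the diffusers source) of how the missing and unexpected key lists above fall out of plain set differences between the model's state dict and the checkpoint:

```py
# Hypothetical key sets for a model and a checkpoint.
expected_keys = {"conv_in.weight", "conv_in.bias", "time_embedding.linear_1.weight"}
loaded_keys = {"conv_in.weight", "conv_in.bias", "old_name.weight"}

missing_keys = sorted(expected_keys - loaded_keys)     # ['time_embedding.linear_1.weight']
unexpected_keys = sorted(loaded_keys - expected_keys)  # ['old_name.weight']
print(missing_keys, unexpected_keys)
```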
969
+
970
+ @classmethod
971
+ def _get_signature_keys(cls, obj):
972
+ parameters = inspect.signature(obj.__init__).parameters
973
+ required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
974
+ optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
975
+ expected_modules = set(required_parameters.keys()) - {"self"}
976
+
977
+ return expected_modules, optional_parameters
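For reference, a toy example (the `Toy` class is hypothetical) of the `inspect.signature` split performed by `_get_signature_keys`: required `__init__` parameters versus parameters with defaults.

```py
import inspect

class Toy:
    def __init__(self, a, b, c=1):
        pass

params = inspect.signature(Toy.__init__).parameters
required = {k for k, v in params.items() if v.default is inspect.Parameter.empty} - {"self"}
optional = {k for k, v in params.items() if v.default is not inspect.Parameter.empty}
print(required, optional)  # e.g. {'a', 'b'} {'c'} (set order may vary)
```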
978
+
979
+ # Adapted from `transformers` modeling_utils.py
980
+ def _get_no_split_modules(self, device_map: str):
981
+ """
982
+ Get the modules of the model that should not be split when using device_map. We iterate through the modules to
983
+ get the underlying `_no_split_modules`.
984
+
985
+ Args:
986
+ device_map (`str`):
987
+ The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
988
+
989
+ Returns:
990
+ `List[str]`: List of modules that should not be split
991
+ """
992
+ _no_split_modules = set()
993
+ modules_to_check = [self]
994
+ while len(modules_to_check) > 0:
995
+ module = modules_to_check.pop(-1)
996
+ # if the module does not appear in _no_split_modules, we also check the children
997
+ if module.__class__.__name__ not in _no_split_modules:
998
+ if isinstance(module, ModelMixin):
999
+ if module._no_split_modules is None:
1000
+ raise ValueError(
1001
+ f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model "
1002
+ "class needs to implement the `_no_split_modules` attribute."
1003
+ )
1004
+ else:
1005
+ _no_split_modules = _no_split_modules | set(module._no_split_modules)
1006
+ modules_to_check += list(module.children())
1007
+ return list(_no_split_modules)
1008
+
1009
+ @property
1010
+ def device(self) -> torch.device:
1011
+ """
1012
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
1013
+ device).
1014
+ """
1015
+ return get_parameter_device(self)
1016
+
1017
+ @property
1018
+ def dtype(self) -> torch.dtype:
1019
+ """
1020
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
1021
+ """
1022
+ return get_parameter_dtype(self)
1023
+
1024
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
1025
+ """
1026
+ Get number of (trainable or non-embedding) parameters in the module.
1027
+
1028
+ Args:
1029
+ only_trainable (`bool`, *optional*, defaults to `False`):
1030
+ Whether or not to return only the number of trainable parameters.
1031
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
1032
+ Whether or not to return only the number of non-embedding parameters.
1033
+
1034
+ Returns:
1035
+ `int`: The number of parameters.
1036
+
1037
+ Example:
1038
+
1039
+ ```py
1040
+ from diffusers import UNet2DConditionModel
1041
+
1042
+ model_id = "runwayml/stable-diffusion-v1-5"
1043
+ unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
1044
+ unet.num_parameters(only_trainable=True)
1045
+ 859520964
1046
+ ```
1047
+ """
1048
+
1049
+ if exclude_embeddings:
1050
+ embedding_param_names = [
1051
+ f"{name}.weight"
1052
+ for name, module_type in self.named_modules()
1053
+ if isinstance(module_type, torch.nn.Embedding)
1054
+ ]
1055
+ non_embedding_parameters = [
1056
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
1057
+ ]
1058
+ return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
1059
+ else:
1060
+ return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
1061
+
1062
+ def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None:
1063
+ deprecated_attention_block_paths = []
1064
+
1065
+ def recursive_find_attn_block(name, module):
1066
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1067
+ deprecated_attention_block_paths.append(name)
1068
+
1069
+ for sub_name, sub_module in module.named_children():
1070
+ sub_name = sub_name if name == "" else f"{name}.{sub_name}"
1071
+ recursive_find_attn_block(sub_name, sub_module)
1072
+
1073
+ recursive_find_attn_block("", self)
1074
+
1075
+ # NOTE: we have to check if the deprecated parameters are in the state dict
1076
+ # because it is possible we are loading from a state dict that was already
1077
+ # converted
1078
+
1079
+ for path in deprecated_attention_block_paths:
1080
+ # group_norm path stays the same
1081
+
1082
+ # query -> to_q
1083
+ if f"{path}.query.weight" in state_dict:
1084
+ state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
1085
+ if f"{path}.query.bias" in state_dict:
1086
+ state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
1087
+
1088
+ # key -> to_k
1089
+ if f"{path}.key.weight" in state_dict:
1090
+ state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
1091
+ if f"{path}.key.bias" in state_dict:
1092
+ state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
1093
+
1094
+ # value -> to_v
1095
+ if f"{path}.value.weight" in state_dict:
1096
+ state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
1097
+ if f"{path}.value.bias" in state_dict:
1098
+ state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
1099
+
1100
+ # proj_attn -> to_out.0
1101
+ if f"{path}.proj_attn.weight" in state_dict:
1102
+ state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
1103
+ if f"{path}.proj_attn.bias" in state_dict:
1104
+ state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
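A toy walk-through (shortened, assumed key names) of the renaming performed above: deprecated `query` / `proj_attn` entries are popped and re-inserted under the `to_q` / `to_out.0` names.

```py
# Minimal stand-in for a state dict that still uses the deprecated attention names.
state_dict = {"mid_block.attn.query.weight": 1, "mid_block.attn.proj_attn.bias": 2}
path = "mid_block.attn"

if f"{path}.query.weight" in state_dict:
    state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
if f"{path}.proj_attn.bias" in state_dict:
    state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")

print(sorted(state_dict))  # ['mid_block.attn.to_out.0.bias', 'mid_block.attn.to_q.weight']
```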
1105
+
1106
+ def _temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1107
+ deprecated_attention_block_modules = []
1108
+
1109
+ def recursive_find_attn_block(module):
1110
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1111
+ deprecated_attention_block_modules.append(module)
1112
+
1113
+ for sub_module in module.children():
1114
+ recursive_find_attn_block(sub_module)
1115
+
1116
+ recursive_find_attn_block(self)
1117
+
1118
+ for module in deprecated_attention_block_modules:
1119
+ module.query = module.to_q
1120
+ module.key = module.to_k
1121
+ module.value = module.to_v
1122
+ module.proj_attn = module.to_out[0]
1123
+
1124
+ # We don't _have_ to delete the old attributes, but it's helpful to ensure
1125
+ # that _all_ the weights are loaded into the new attributes and we're not
1126
+ # making an incorrect assumption that this model should be converted when
1127
+ # it really shouldn't be.
1128
+ del module.to_q
1129
+ del module.to_k
1130
+ del module.to_v
1131
+ del module.to_out
1132
+
1133
+ def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1134
+ deprecated_attention_block_modules = []
1135
+
1136
+ def recursive_find_attn_block(module) -> None:
1137
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1138
+ deprecated_attention_block_modules.append(module)
1139
+
1140
+ for sub_module in module.children():
1141
+ recursive_find_attn_block(sub_module)
1142
+
1143
+ recursive_find_attn_block(self)
1144
+
1145
+ for module in deprecated_attention_block_modules:
1146
+ module.to_q = module.query
1147
+ module.to_k = module.key
1148
+ module.to_v = module.value
1149
+ module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)])
1150
+
1151
+ del module.query
1152
+ del module.key
1153
+ del module.value
1154
+ del module.proj_attn
1155
+
1156
+
1157
+ class LegacyModelMixin(ModelMixin):
1158
+ r"""
1159
+ A subclass of `ModelMixin` to resolve class mapping from legacy classes (like `Transformer2DModel`) to more
1160
+ pipeline-specific classes (like `DiTTransformer2DModel`).
1161
+ """
1162
+
1163
+ @classmethod
1164
+ @validate_hf_hub_args
1165
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
1166
+ # To prevent dependency import problem.
1167
+ from diffusers.models.model_loading_utils import _fetch_remapped_cls_from_config
1168
+
1169
+ # Create a copy of the kwargs so that we don't mess with the keyword arguments in the downstream calls.
1170
+ kwargs_copy = kwargs.copy()
1171
+
1172
+ cache_dir = kwargs.pop("cache_dir", None)
1173
+ force_download = kwargs.pop("force_download", False)
1174
+ proxies = kwargs.pop("proxies", None)
1175
+ local_files_only = kwargs.pop("local_files_only", None)
1176
+ token = kwargs.pop("token", None)
1177
+ revision = kwargs.pop("revision", None)
1178
+ subfolder = kwargs.pop("subfolder", None)
1179
+
1180
+ # Load config if we don't provide a configuration
1181
+ config_path = pretrained_model_name_or_path
1182
+
1183
+ user_agent = {
1184
+ "diffusers": __version__,
1185
+ "file_type": "model",
1186
+ "framework": "pytorch",
1187
+ }
1188
+
1189
+ # load config
1190
+ config, _, _ = cls.load_config(
1191
+ config_path,
1192
+ cache_dir=cache_dir,
1193
+ return_unused_kwargs=True,
1194
+ return_commit_hash=True,
1195
+ force_download=force_download,
1196
+ proxies=proxies,
1197
+ local_files_only=local_files_only,
1198
+ token=token,
1199
+ revision=revision,
1200
+ subfolder=subfolder,
1201
+ user_agent=user_agent,
1202
+ **kwargs,
1203
+ )
1204
+ # resolve remapping
1205
+ remapped_class = _fetch_remapped_cls_from_config(config, cls)
1206
+
1207
+ return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)
mcp_servers/fashion_vlm/models/phi.py ADDED
@@ -0,0 +1,1489 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """PyTorch Phi model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+ from packaging import version
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache
30
+ from transformers.modeling_attn_mask_utils import (
31
+ _prepare_4d_causal_attention_mask,
32
+ _prepare_4d_causal_attention_mask_for_sdpa,
33
+ )
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPast,
36
+ CausalLMOutputWithPast,
37
+ SequenceClassifierOutputWithPast,
38
+ TokenClassifierOutput,
39
+ )
40
+ from transformers.modeling_utils import PreTrainedModel
41
+ from transformers.utils import (
42
+ add_code_sample_docstrings,
43
+ add_start_docstrings,
44
+ add_start_docstrings_to_model_forward,
45
+ get_torch_version,
46
+ is_flash_attn_2_available,
47
+ is_flash_attn_greater_or_equal_2_10,
48
+ logging,
49
+ replace_return_docstrings,
50
+ )
51
+ from transformers.models.phi.configuration_phi import PhiConfig
52
+
53
+
54
+ if is_flash_attn_2_available():
55
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
56
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
57
+
58
+
59
+ logger = logging.get_logger(__name__)
60
+
61
+ _CHECKPOINT_FOR_DOC = "microsoft/phi-1"
62
+ _CONFIG_FOR_DOC = "PhiConfig"
63
+
64
+
65
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
66
+ def _get_unpad_data(attention_mask):
67
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
68
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
69
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
70
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
71
+ return (
72
+ indices,
73
+ cu_seqlens,
74
+ max_seqlen_in_batch,
75
+ )
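A worked example (toy mask) of what `_get_unpad_data` returns for a left-padded batch: the flat indices of real tokens, the cumulative sequence lengths, and the longest sequence.

```py
import torch

attention_mask = torch.tensor([[0, 1, 1, 1],
                               [0, 0, 1, 1]])  # 1 = real token, 0 = padding
indices, cu_seqlens, max_len = _get_unpad_data(attention_mask)
print(indices)     # tensor([1, 2, 3, 6, 7]), flat positions of the real tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32), sequence lengths 3 and 2
print(max_len)     # 3
```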
76
+
77
+
78
+ # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Phi
79
+ class PhiRotaryEmbedding(nn.Module):
80
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
81
+ super().__init__()
82
+
83
+ self.dim = dim
84
+ self.max_position_embeddings = max_position_embeddings
85
+ self.base = base
86
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
87
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
88
+
89
+ # Build here to make `torch.jit.trace` work.
90
+ self._set_cos_sin_cache(
91
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
92
+ )
93
+
94
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
95
+ self.max_seq_len_cached = seq_len
96
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
97
+
98
+ freqs = torch.outer(t, self.inv_freq)
99
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
100
+ emb = torch.cat((freqs, freqs), dim=-1)
101
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
102
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
103
+
104
+ def forward(self, x, seq_len=None):
105
+ # x: [bs, num_attention_heads, seq_len, head_size]
106
+ if seq_len > self.max_seq_len_cached:
107
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
108
+
109
+ return (
110
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
111
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
112
+ )
113
+
114
+
115
+ # Copied from transformers.models.falcon.modeling_falcon.FalconLinearScalingRotaryEmbedding with Falcon->Phi
116
+ class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
117
+ """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
118
+
119
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
120
+ self.scaling_factor = scaling_factor
121
+ super().__init__(dim, max_position_embeddings, base, device)
122
+
123
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
124
+ self.max_seq_len_cached = seq_len
125
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
126
+ t = t / self.scaling_factor
127
+
128
+ freqs = torch.outer(t, self.inv_freq)
129
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
130
+ emb = torch.cat((freqs, freqs), dim=-1)
131
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
132
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
133
+
134
+
135
+ # Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->Phi
136
+ class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
137
+ """PhiRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
138
+
139
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
140
+ self.scaling_factor = scaling_factor
141
+ super().__init__(dim, max_position_embeddings, base, device)
142
+
143
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
144
+ self.max_seq_len_cached = seq_len
145
+
146
+ if seq_len > self.max_position_embeddings:
147
+ base = self.base * (
148
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
149
+ ) ** (self.dim / (self.dim - 2))
150
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
151
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
152
+
153
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
154
+
155
+ freqs = torch.outer(t, self.inv_freq)
156
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
157
+ emb = torch.cat((freqs, freqs), dim=-1)
158
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
159
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
160
+
161
+
162
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
163
+ def rotate_half(x):
164
+ """Rotates half the hidden dims of the input."""
165
+ x1 = x[..., : x.shape[-1] // 2]
166
+ x2 = x[..., x.shape[-1] // 2 :]
167
+ return torch.cat((-x2, x1), dim=-1)
168
+
169
+
170
+ # Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
171
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
172
+ """Applies Rotary Position Embedding to the query and key tensors.
173
+
174
+ Args:
175
+ q (`torch.Tensor`): The query tensor.
176
+ k (`torch.Tensor`): The key tensor.
177
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
178
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
179
+ position_ids (`torch.Tensor`):
180
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
181
+ used to pass offsetted position ids when working with a KV-cache.
182
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
183
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
184
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
185
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
186
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
187
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
188
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
189
+ Returns:
190
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
191
+ """
192
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
193
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
194
+ q_embed = (q * cos) + (rotate_half(q) * sin)
195
+ k_embed = (k * cos) + (rotate_half(k) * sin)
196
+ return q_embed, k_embed
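A shape-level sanity check (toy sizes; relies on the `PhiRotaryEmbedding` and `apply_rotary_pos_emb` definitions above) showing how the cached cos/sin tables broadcast over the query and key tensors:

```py
import torch

bsz, heads, seq, head_dim = 1, 2, 4, 8
q = torch.randn(bsz, heads, seq, head_dim)
k = torch.randn(bsz, heads, seq, head_dim)

rope = PhiRotaryEmbedding(head_dim, max_position_embeddings=seq)
cos, sin = rope(q, seq_len=seq)                # each of shape [seq, head_dim]
position_ids = torch.arange(seq).unsqueeze(0)  # [1, seq]

q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_embed.shape == q.shape and k_embed.shape == k.shape
```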
197
+
198
+
199
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Phi
200
+ class PhiMLP(nn.Module):
201
+ def __init__(self, config):
202
+ super().__init__()
203
+ self.config = config
204
+ self.activation_fn = ACT2FN[config.hidden_act]
205
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
206
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
207
+
208
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
209
+ hidden_states = self.fc1(hidden_states)
210
+ hidden_states = self.activation_fn(hidden_states)
211
+ hidden_states = self.fc2(hidden_states)
212
+ return hidden_states
213
+
214
+
215
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
216
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
217
+ """
218
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
219
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
220
+ """
221
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
222
+ if n_rep == 1:
223
+ return hidden_states
224
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
225
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
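A quick equivalence check (toy tensor) for `repeat_kv` as defined above, confirming the `torch.repeat_interleave` behaviour stated in its docstring:

```py
import torch

x = torch.randn(2, 3, 5, 4)  # (batch, num_key_value_heads, seq_len, head_dim)
assert torch.equal(repeat_kv(x, 2), torch.repeat_interleave(x, repeats=2, dim=1))
```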
226
+
227
+
228
+ class PhiAttention(nn.Module):
229
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
230
+
231
+ def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
232
+ super().__init__()
233
+ self.config = config
234
+ self.layer_idx = layer_idx
235
+ if layer_idx is None:
236
+ logger.warning_once(
237
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
238
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
239
+ "when creating this class."
240
+ )
241
+
242
+ self.attention_dropout = config.attention_dropout
243
+ self.hidden_size = config.hidden_size
244
+ self.num_heads = config.num_attention_heads
245
+ self.head_dim = self.hidden_size // self.num_heads
246
+ self.num_key_value_heads = config.num_key_value_heads
247
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
248
+ self.max_position_embeddings = config.max_position_embeddings
249
+ self.rope_theta = config.rope_theta
250
+ self.partial_rotary_factor = config.partial_rotary_factor
251
+ self.is_causal = True
252
+
253
+ if (self.head_dim * self.num_heads) != self.hidden_size:
254
+ raise ValueError(
255
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
256
+ f" and `num_heads`: {self.num_heads})."
257
+ )
258
+
259
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
260
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
261
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
262
+ self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
263
+
264
+ self.qk_layernorm = config.qk_layernorm
265
+ if self.qk_layernorm:
266
+ self.q_layernorm = nn.LayerNorm(
267
+ config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
268
+ )
269
+ self.k_layernorm = nn.LayerNorm(
270
+ config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
271
+ )
272
+
273
+ self._init_rope()
274
+
275
+ def _init_rope(self):
276
+ if self.config.rope_scaling is None:
277
+ self.rotary_emb = PhiRotaryEmbedding(
278
+ int(self.partial_rotary_factor * self.head_dim),
279
+ max_position_embeddings=self.max_position_embeddings,
280
+ base=self.rope_theta,
281
+ )
282
+ else:
283
+ scaling_type = self.config.rope_scaling["type"]
284
+ scaling_factor = self.config.rope_scaling["factor"]
285
+ if scaling_type == "linear":
286
+ self.rotary_emb = PhiLinearScalingRotaryEmbedding(
287
+ int(self.partial_rotary_factor * self.head_dim),
288
+ max_position_embeddings=self.max_position_embeddings,
289
+ scaling_factor=scaling_factor,
290
+ base=self.rope_theta,
291
+ )
292
+ elif scaling_type == "dynamic":
293
+ self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding(
294
+ int(self.partial_rotary_factor * self.head_dim),
295
+ max_position_embeddings=self.max_position_embeddings,
296
+ scaling_factor=scaling_factor,
297
+ base=self.rope_theta,
298
+ )
299
+ else:
300
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
301
+
302
+ def forward(
303
+ self,
304
+ hidden_states: torch.Tensor,
305
+ attention_mask: Optional[torch.Tensor] = None,
306
+ position_ids: Optional[torch.LongTensor] = None,
307
+ past_key_value: Optional[Cache] = None,
308
+ output_attentions: bool = False,
309
+ use_cache: bool = False,
310
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
311
+ bsz, q_len, _ = hidden_states.size()
312
+
313
+ query_states = self.q_proj(hidden_states)
314
+ key_states = self.k_proj(hidden_states)
315
+ value_states = self.v_proj(hidden_states)
316
+
317
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
318
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
319
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
320
+
321
+ if self.qk_layernorm:
322
+ query_states = self.q_layernorm(query_states)
323
+ key_states = self.k_layernorm(key_states)
324
+
325
+ kv_seq_len = key_states.shape[-2]
326
+ if past_key_value is not None:
327
+ if self.layer_idx is None:
328
+ raise ValueError(
329
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
330
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
331
+ "with a layer index."
332
+ )
333
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
334
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
335
+
336
+ # Partial rotary embedding
337
+ query_rot, query_pass = (
338
+ query_states[..., : self.rotary_emb.dim],
339
+ query_states[..., self.rotary_emb.dim :],
340
+ )
341
+ key_rot, key_pass = (
342
+ key_states[..., : self.rotary_emb.dim],
343
+ key_states[..., self.rotary_emb.dim :],
344
+ )
345
+ # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
346
+ query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
347
+
348
+ # [batch_size, seq_length, num_heads, head_dim]
349
+ query_states = torch.cat((query_rot, query_pass), dim=-1)
350
+ key_states = torch.cat((key_rot, key_pass), dim=-1)
351
+
352
+ if past_key_value is not None:
353
+ cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
354
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
355
+
356
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
357
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
358
+
359
+ # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow
360
+ attn_weights = torch.matmul(
361
+ query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3)
362
+ ) / math.sqrt(self.head_dim)
363
+
364
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
365
+ raise ValueError(
366
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
367
+ f" {attn_weights.size()}"
368
+ )
369
+
370
+ if attention_mask is not None:
371
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
372
+ raise ValueError(
373
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
374
+ )
375
+ attn_weights = attn_weights + attention_mask
376
+
377
+ # upcast attention to fp32
378
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
379
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
380
+
381
+ attn_output = torch.matmul(attn_weights, value_states)
382
+
383
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
384
+ raise ValueError(
385
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
386
+ f" {attn_output.size()}"
387
+ )
388
+
389
+ attn_output = attn_output.transpose(1, 2).contiguous()
390
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
391
+
392
+ attn_output = self.dense(attn_output)
393
+
394
+ if not output_attentions:
395
+ attn_weights = None
396
+
397
+ return attn_output, attn_weights, past_key_value
398
+
399
+
400
+ class PhiFlashAttention2(PhiAttention):
401
+ """
402
+ Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stay
403
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
404
+ flash attention and deal with padding tokens in case the input contains any of them.
405
+ """
406
+
407
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
408
+ def __init__(self, *args, **kwargs):
409
+ super().__init__(*args, **kwargs)
410
+
411
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
412
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
413
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
414
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
415
+
416
+ def forward(
417
+ self,
418
+ hidden_states: torch.Tensor,
419
+ attention_mask: Optional[torch.LongTensor] = None,
420
+ position_ids: Optional[torch.LongTensor] = None,
421
+ past_key_value: Optional[Cache] = None,
422
+ output_attentions: bool = False,
423
+ use_cache: bool = False,
424
+ **kwargs,
425
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
426
+ # PhiFlashAttention2 attention does not support output_attentions
427
+
428
+ output_attentions = False
429
+
430
+ bsz, q_len, _ = hidden_states.size()
431
+
432
+ query_states = self.q_proj(hidden_states)
433
+ key_states = self.k_proj(hidden_states)
434
+ value_states = self.v_proj(hidden_states)
435
+
436
+ # Flash attention requires the input to have the shape
437
+ # batch_size x seq_length x head_dim x hidden_dim
438
+ # therefore we just need to keep the original shape
439
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
440
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
441
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
442
+
443
+ if self.qk_layernorm:
444
+ query_states = self.q_layernorm(query_states)
445
+ key_states = self.k_layernorm(key_states)
446
+
447
+ kv_seq_len = key_states.shape[-2]
448
+ if past_key_value is not None:
449
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
450
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
451
+
452
+ # Partial rotary embedding
453
+ query_rot, query_pass = (
454
+ query_states[..., : self.rotary_emb.dim],
455
+ query_states[..., self.rotary_emb.dim :],
456
+ )
457
+ key_rot, key_pass = (
458
+ key_states[..., : self.rotary_emb.dim],
459
+ key_states[..., self.rotary_emb.dim :],
460
+ )
461
+ # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
462
+ query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
463
+
464
+ # [batch_size, seq_length, num_heads, head_dim]
465
+ query_states = torch.cat((query_rot, query_pass), dim=-1)
466
+ key_states = torch.cat((key_rot, key_pass), dim=-1)
467
+
468
+ if past_key_value is not None:
469
+ cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
470
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
471
+
472
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
473
+ # to be able to avoid many of these transpose/reshape/view.
474
+ query_states = query_states.transpose(1, 2)
475
+ key_states = key_states.transpose(1, 2)
476
+ value_states = value_states.transpose(1, 2)
477
+
478
+ attn_dropout = self.attention_dropout if self.training else 0.0
479
+
480
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
481
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
482
+ # cast them back to the correct dtype just to be sure everything works as expected.
483
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
484
+ # in fp32.
485
+
486
+ if query_states.dtype == torch.float32:
487
+ if torch.is_autocast_enabled():
488
+ target_dtype = torch.get_autocast_gpu_dtype()
489
+ # Handle the case where the model is quantized
490
+ elif hasattr(self.config, "_pre_quantization_dtype"):
491
+ target_dtype = self.config._pre_quantization_dtype
492
+ else:
493
+ target_dtype = self.q_proj.weight.dtype
494
+
495
+ logger.warning_once(
496
+ f"The input hidden states seem to have been silently cast to float32; this might be related to"
497
+ f" the fact that you have upcast embedding or layer norm layers to float32. We will cast the input back to"
498
+ f" {target_dtype}."
499
+ )
500
+
501
+ query_states = query_states.to(target_dtype)
502
+ key_states = key_states.to(target_dtype)
503
+ value_states = value_states.to(target_dtype)
504
+
505
+ attn_output = self._flash_attention_forward(
506
+ query_states, key_states, value_states, attention_mask, q_len, dropout=attn_dropout, softmax_scale=None
507
+ )
508
+
509
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
510
+ attn_output = self.dense(attn_output)
511
+
512
+ if not output_attentions:
513
+ attn_weights = None
514
+
515
+ return attn_output, attn_weights, past_key_value
516
+
517
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
518
+ def _flash_attention_forward(
519
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
520
+ ):
521
+ """
522
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
523
+ the input is first unpadded, the attention scores are computed, and the output is padded back to the original length.
524
+
525
+ Args:
526
+ query_states (`torch.Tensor`):
527
+ Input query states to be passed to Flash Attention API
528
+ key_states (`torch.Tensor`):
529
+ Input key states to be passed to Flash Attention API
530
+ value_states (`torch.Tensor`):
531
+ Input value states to be passed to Flash Attention API
532
+ attention_mask (`torch.Tensor`):
533
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
534
+ position of padding tokens and 1 for the position of non-padding tokens.
535
+ dropout (`float`):
536
+ Attention dropout
537
+ softmax_scale (`float`, *optional*):
538
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
539
+ """
540
+ if not self._flash_attn_uses_top_left_mask:
541
+ causal = self.is_causal
542
+ else:
543
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
544
+ causal = self.is_causal and query_length != 1
545
+
546
+ # Contains at least one padding token in the sequence
547
+ if attention_mask is not None:
548
+ batch_size = query_states.shape[0]
549
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
550
+ query_states, key_states, value_states, attention_mask, query_length
551
+ )
552
+
553
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
554
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
555
+
556
+ attn_output_unpad = flash_attn_varlen_func(
557
+ query_states,
558
+ key_states,
559
+ value_states,
560
+ cu_seqlens_q=cu_seqlens_q,
561
+ cu_seqlens_k=cu_seqlens_k,
562
+ max_seqlen_q=max_seqlen_in_batch_q,
563
+ max_seqlen_k=max_seqlen_in_batch_k,
564
+ dropout_p=dropout,
565
+ softmax_scale=softmax_scale,
566
+ causal=causal,
567
+ )
568
+
569
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
570
+ else:
571
+ attn_output = flash_attn_func(
572
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
573
+ )
574
+
575
+ return attn_output
576
+
577
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
578
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
579
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
580
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
581
+
582
+ key_layer = index_first_axis(
583
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
584
+ )
585
+ value_layer = index_first_axis(
586
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
587
+ )
588
+ if query_length == kv_seq_len:
589
+ query_layer = index_first_axis(
590
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
591
+ )
592
+ cu_seqlens_q = cu_seqlens_k
593
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
594
+ indices_q = indices_k
595
+ elif query_length == 1:
596
+ max_seqlen_in_batch_q = 1
597
+ cu_seqlens_q = torch.arange(
598
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
599
+ ) # There is a memcpy here, that is very bad.
600
+ indices_q = cu_seqlens_q[:-1]
601
+ query_layer = query_layer.squeeze(1)
602
+ else:
603
+ # The -q_len: slice assumes left padding.
604
+ attention_mask = attention_mask[:, -query_length:]
605
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
606
+
607
+ return (
608
+ query_layer,
609
+ key_layer,
610
+ value_layer,
611
+ indices_q,
612
+ (cu_seqlens_q, cu_seqlens_k),
613
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
614
+ )
615
+
616
+
617
+ class PhiSdpaAttention(PhiAttention):
618
+ """
619
+ SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
620
+ `PhiAttention`, as the weights of the module stay untouched. The only changes are in the forward pass, to adapt
621
+ it to the SDPA API.
622
+ """
623
+
624
+ def __init__(self, *args, **kwargs):
625
+ super().__init__(*args, **kwargs)
626
+ self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
627
+
628
+ # Adapted from PhiAttention.forward
629
+ def forward(
630
+ self,
631
+ hidden_states: torch.Tensor,
632
+ attention_mask: Optional[torch.Tensor] = None,
633
+ position_ids: Optional[torch.LongTensor] = None,
634
+ past_key_value: Optional[Cache] = None,
635
+ output_attentions: bool = False,
636
+ use_cache: bool = False,
637
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
638
+ if output_attentions:
639
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
640
+ logger.warning_once(
641
+ "PhiModel is using PhiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not "
642
+ "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying "
643
+ "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can "
644
+ 'be removed using the argument `attn_implementation="eager"` when loading the model.'
645
+ )
646
+ return super().forward(
647
+ hidden_states=hidden_states,
648
+ attention_mask=attention_mask,
649
+ position_ids=position_ids,
650
+ past_key_value=past_key_value,
651
+ output_attentions=output_attentions,
652
+ use_cache=use_cache,
653
+ )
654
+
655
+ bsz, q_len, _ = hidden_states.size()
656
+
657
+ query_states = self.q_proj(hidden_states)
658
+ key_states = self.k_proj(hidden_states)
659
+ value_states = self.v_proj(hidden_states)
660
+
661
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
662
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
663
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
664
+
665
+ if self.qk_layernorm:
666
+ query_states = self.q_layernorm(query_states)
667
+ key_states = self.k_layernorm(key_states)
668
+
669
+ kv_seq_len = key_states.shape[-2]
670
+ if past_key_value is not None:
671
+ if self.layer_idx is None:
672
+ raise ValueError(
673
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
674
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
675
+ "with a layer index."
676
+ )
677
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
678
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
679
+
680
+ # Partial rotary embedding
681
+ query_rot, query_pass = (
682
+ query_states[..., : self.rotary_emb.dim],
683
+ query_states[..., self.rotary_emb.dim :],
684
+ )
685
+ key_rot, key_pass = (
686
+ key_states[..., : self.rotary_emb.dim],
687
+ key_states[..., self.rotary_emb.dim :],
688
+ )
689
+ # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
690
+ query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
691
+
692
+ # [batch_size, seq_length, num_heads, head_dim]
693
+ query_states = torch.cat((query_rot, query_pass), dim=-1)
694
+ key_states = torch.cat((key_rot, key_pass), dim=-1)
695
+
696
+ if past_key_value is not None:
697
+ cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
698
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
699
+
700
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
701
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
702
+
703
+ # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
704
+ # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
705
+ # Reference: https://github.com/pytorch/pytorch/issues/112577
706
+ if self.require_contiguous_qkv and query_states.device.type == "cuda" and attention_mask is not None:
707
+ query_states = query_states.contiguous()
708
+ key_states = key_states.contiguous()
709
+ value_states = value_states.contiguous()
710
+
711
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
712
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
713
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
714
+
715
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
716
+ query_states,
717
+ key_states,
718
+ value_states,
719
+ attn_mask=attention_mask,
720
+ dropout_p=self.attention_dropout if self.training else 0.0,
721
+ is_causal=is_causal,
722
+ )
723
+
724
+ attn_output = attn_output.transpose(1, 2).contiguous()
725
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
726
+
727
+ attn_output = self.dense(attn_output)
728
+
729
+ return attn_output, None, past_key_value
730
+
731
+
732
+ PHI_ATTENTION_CLASSES = {
733
+ "eager": PhiAttention,
734
+ "flash_attention_2": PhiFlashAttention2,
735
+ "sdpa": PhiSdpaAttention,
736
+ }
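For reference, this dispatch table is what `PhiDecoderLayer` below indexes with `config._attn_implementation`. A minimal, illustrative sketch of that selection; the `PhiConfig` import path and the config values are assumptions (the repo may vendor its own configuration class):

import torch  # noqa: F401  (torch is already imported in this module)
from transformers.models.phi.configuration_phi import PhiConfig  # assumed import path

config = PhiConfig()                      # default config, for illustration only
config._attn_implementation = "sdpa"      # one of "eager", "flash_attention_2", "sdpa"

attn_cls = PHI_ATTENTION_CLASSES[config._attn_implementation]
self_attn = attn_cls(config, layer_idx=0) # all three classes share this constructor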
737
+
738
+
739
+ class PhiDecoderLayer(nn.Module):
740
+ def __init__(self, config: PhiConfig, layer_idx: int):
741
+ super().__init__()
742
+ self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
743
+ self.mlp = PhiMLP(config)
744
+ self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
745
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
746
+
747
+ def forward(
748
+ self,
749
+ hidden_states: torch.Tensor,
750
+ attention_mask: Optional[torch.Tensor] = None,
751
+ position_ids: Optional[torch.LongTensor] = None,
752
+ output_attentions: Optional[bool] = False,
753
+ use_cache: Optional[bool] = False,
754
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
755
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
756
+ """
757
+ Args:
758
+ hidden_states (`torch.FloatTensor`):
759
+ input to the layer of shape `(batch, seq_len, embed_dim)`
760
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
761
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
762
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
763
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
764
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
765
+ output_attentions (`bool`, *optional*):
766
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
767
+ returned tensors for more detail.
768
+ use_cache (`bool`, *optional*):
769
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
770
+ (see `past_key_values`).
771
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
772
+ """
773
+
774
+ residual = hidden_states
775
+
776
+ hidden_states = self.input_layernorm(hidden_states)
777
+
778
+ # Self Attention
779
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
780
+ hidden_states=hidden_states,
781
+ attention_mask=attention_mask,
782
+ position_ids=position_ids,
783
+ past_key_value=past_key_value,
784
+ output_attentions=output_attentions,
785
+ use_cache=use_cache,
786
+ )
787
+ attn_outputs = self.resid_dropout(attn_outputs)
788
+
789
+ feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
790
+ hidden_states = attn_outputs + feed_forward_hidden_states + residual
791
+ outputs = (hidden_states,)
792
+
793
+ if output_attentions:
794
+ outputs += (self_attn_weights,)
795
+
796
+ if use_cache:
797
+ outputs += (present_key_value,)
798
+
799
+ return outputs
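Note that this block is wired as a parallel residual: the attention branch and the MLP branch both read the same layer-normed input, and their outputs are added to the residual in a single step, rather than the sequential pre-norm pattern used in e.g. Llama. A stripped-down sketch of just that data flow, with toy modules standing in for `self_attn`, `mlp`, `input_layernorm` and `resid_dropout`:

import torch
import torch.nn as nn

def parallel_block(x, attn, mlp, norm, drop):
    residual = x
    h = norm(x)                            # one shared layer norm
    attn_out = drop(attn(h))               # attention branch
    mlp_out = drop(mlp(h))                 # feed-forward branch, same input as attention
    return attn_out + mlp_out + residual   # both branches join the residual at once

x = torch.randn(1, 4, 8)
y = parallel_block(x, attn=nn.Identity(), mlp=nn.Identity(),
                   norm=nn.LayerNorm(8), drop=nn.Dropout(0.0))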
800
+
801
+
802
+ PHI_START_DOCSTRING = r"""
803
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
804
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
805
+ etc.)
806
+
807
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
808
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
809
+ and behavior.
810
+
811
+ Parameters:
812
+ config ([`PhiConfig`]):
813
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
814
+ load the weights associated with the model, only the configuration. Check out the
815
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
816
+ """
817
+
818
+
819
+ @add_start_docstrings(
820
+ "The bare Phi Model outputting raw hidden-states without any specific head on top.",
821
+ PHI_START_DOCSTRING,
822
+ )
823
+ class PhiPreTrainedModel(PreTrainedModel):
824
+ config_class = PhiConfig
825
+ base_model_prefix = "model"
826
+ supports_gradient_checkpointing = True
827
+ _no_split_modules = ["PhiDecoderLayer"]
828
+ _skip_keys_device_placement = "past_key_values"
829
+ _supports_flash_attn_2 = True
830
+ _supports_sdpa = True
831
+ _supports_cache_class = True
832
+
833
+ def _init_weights(self, module):
834
+ std = self.config.initializer_range
835
+ if isinstance(module, nn.Linear):
836
+ module.weight.data.normal_(mean=0.0, std=std)
837
+ if module.bias is not None:
838
+ module.bias.data.zero_()
839
+ elif isinstance(module, nn.Embedding):
840
+ module.weight.data.normal_(mean=0.0, std=std)
841
+ if module.padding_idx is not None:
842
+ module.weight.data[module.padding_idx].zero_()
843
+
844
+
845
+ PHI_INPUTS_DOCSTRING = r"""
846
+ Args:
847
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
848
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
849
+ it.
850
+
851
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
852
+ [`PreTrainedTokenizer.__call__`] for details.
853
+
854
+ [What are input IDs?](../glossary#input-ids)
855
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
856
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
857
+
858
+ - 1 for tokens that are **not masked**,
859
+ - 0 for tokens that are **masked**.
860
+
861
+ [What are attention masks?](../glossary#attention-mask)
862
+
863
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
864
+ [`PreTrainedTokenizer.__call__`] for details.
865
+
866
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
867
+ `past_key_values`).
868
+
869
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
870
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
871
+ information on the default strategy.
872
+
873
+ - 1 indicates the head is **not masked**,
874
+ - 0 indicates the head is **masked**.
875
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
876
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
877
+ config.n_positions - 1]`.
878
+
879
+ [What are position IDs?](../glossary#position-ids)
880
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
881
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
882
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
883
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
884
+
885
+ Two formats are allowed:
886
+ - a [`~cache_utils.Cache`] instance;
887
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
888
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
889
+ cache format.
890
+
891
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
892
+ legacy cache format will be returned.
893
+
894
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
895
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
896
+ of shape `(batch_size, sequence_length)`.
897
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
898
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
899
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
900
+ model's internal embedding lookup matrix.
901
+ use_cache (`bool`, *optional*):
902
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
903
+ `past_key_values`).
904
+ output_attentions (`bool`, *optional*):
905
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
906
+ tensors for more detail.
907
+ output_hidden_states (`bool`, *optional*):
908
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
909
+ more detail.
910
+ return_dict (`bool`, *optional*):
911
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
912
+ """
913
+
914
+
915
+ @add_start_docstrings(
916
+ "The bare Phi Model outputting raw hidden-states without any specific head on top.",
917
+ PHI_START_DOCSTRING,
918
+ )
919
+ class PhiModel(PhiPreTrainedModel):
920
+ """
921
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`]
922
+
923
+ Args:
924
+ config: PhiConfig
925
+ """
926
+
927
+ def __init__(self, config: PhiConfig):
928
+ super().__init__(config)
929
+ self.padding_idx = config.pad_token_id
930
+ self.vocab_size = config.vocab_size
931
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
932
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
933
+ print("attention implementation: ", config._attn_implementation)
934
+ self.layers = nn.ModuleList(
935
+ [PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
936
+ )
937
+ self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
938
+
939
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
940
+ self._use_sdpa = config._attn_implementation == "sdpa"
941
+
942
+ self.gradient_checkpointing = False
943
+ # Initialize weights and apply final processing
944
+ self.post_init()
945
+
946
+ def get_input_embeddings(self):
947
+ return self.embed_tokens
948
+
949
+ def set_input_embeddings(self, value):
950
+ self.embed_tokens = value
951
+
952
+ @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
953
+ def forward(
954
+ self,
955
+ input_ids: torch.LongTensor = None,
956
+ attention_mask: Optional[torch.Tensor] = None,
957
+ position_ids: Optional[torch.LongTensor] = None,
958
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
959
+ inputs_embeds: Optional[torch.FloatTensor] = None,
960
+ use_cache: Optional[bool] = None,
961
+ output_attentions: Optional[bool] = None,
962
+ output_hidden_states: Optional[bool] = None,
963
+ return_dict: Optional[bool] = None,
964
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
965
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
966
+ output_hidden_states = (
967
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
968
+ )
969
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
970
+
971
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
972
+
973
+ # retrieve input_ids and inputs_embeds
974
+ if input_ids is not None and inputs_embeds is not None:
975
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
976
+ elif input_ids is not None:
977
+ batch_size, seq_length = input_ids.shape[:2]
978
+ elif inputs_embeds is not None:
979
+ batch_size, seq_length = inputs_embeds.shape[:2]
980
+ else:
981
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
982
+
983
+ past_key_values_length = 0
984
+
985
+ if self.gradient_checkpointing and self.training:
986
+ if use_cache:
987
+ logger.warning_once(
988
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
989
+ )
990
+ use_cache = False
991
+
992
+ if use_cache:
993
+ use_legacy_cache = not isinstance(past_key_values, Cache)
994
+ if use_legacy_cache:
995
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
996
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
997
+
998
+ if position_ids is None:
999
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1000
+ position_ids = torch.arange(
1001
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1002
+ )
1003
+ position_ids = position_ids.unsqueeze(0)
1004
+
1005
+ if inputs_embeds is None:
1006
+ inputs_embeds = self.embed_tokens(input_ids)
1007
+
1008
+ inputs_embeds = self.embed_dropout(inputs_embeds)
1009
+ # commented by Xavier
1010
+ # Attention mask.
1011
+ # if self._use_flash_attention_2:
1012
+ # # 2d mask is passed through the layers
1013
+ # attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1014
+ # elif self._use_sdpa and not output_attentions:
1015
+ # attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1016
+ # attention_mask,
1017
+ # (batch_size, seq_length),
1018
+ # inputs_embeds,
1019
+ # past_key_values_length,
1020
+ # )
1021
+ # else:
1022
+ # # 4d mask is passed through the layers
1023
+ # attention_mask = _prepare_4d_causal_attention_mask(
1024
+ # attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
1025
+ # )
1026
+ # commented by Xavier
1027
+
1028
+ hidden_states = inputs_embeds
1029
+
1030
+ # decoder layers
1031
+ all_hidden_states = () if output_hidden_states else None
1032
+ all_self_attns = () if output_attentions else None
1033
+ next_decoder_cache = None
1034
+ for decoder_layer in self.layers:
1035
+ if output_hidden_states:
1036
+ all_hidden_states += (hidden_states,)
1037
+
1038
+ if self.gradient_checkpointing and self.training:
1039
+ layer_outputs = self._gradient_checkpointing_func(
1040
+ decoder_layer.__call__,
1041
+ hidden_states,
1042
+ attention_mask,
1043
+ position_ids,
1044
+ past_key_values,
1045
+ output_attentions,
1046
+ )
1047
+ else:
1048
+ layer_outputs = decoder_layer(
1049
+ hidden_states,
1050
+ attention_mask=attention_mask,
1051
+ position_ids=position_ids,
1052
+ past_key_value=past_key_values,
1053
+ output_attentions=output_attentions,
1054
+ use_cache=use_cache,
1055
+ )
1056
+
1057
+ hidden_states = layer_outputs[0]
1058
+
1059
+ if use_cache:
1060
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1061
+
1062
+ if output_attentions:
1063
+ all_self_attns += (layer_outputs[1],)
1064
+
1065
+ hidden_states = self.final_layernorm(hidden_states)
1066
+
1067
+ # add hidden states from the last decoder layer
1068
+ if output_hidden_states:
1069
+ all_hidden_states += (hidden_states,)
1070
+
1071
+ next_cache = None
1072
+ if use_cache:
1073
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1074
+ if not return_dict:
1075
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1076
+ return BaseModelOutputWithPast(
1077
+ last_hidden_state=hidden_states,
1078
+ past_key_values=next_cache,
1079
+ hidden_states=all_hidden_states,
1080
+ attentions=all_self_attns,
1081
+ )
1082
+
1083
+
1084
+ class PhiForCausalLM(PhiPreTrainedModel):
1085
+ _tied_weights_keys = ["lm_head.weight"]
1086
+ def __init__(self, config):
1087
+ super().__init__(config)
1088
+ config.qk_layernorm = True
1089
+ config.use_cache = False
1090
+ self.model = PhiModel(config)
1091
+ self.vocab_size = config.vocab_size
1092
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
1093
+
1094
+ # Initialize weights and apply final processing
1095
+ self.post_init()
1096
+
1097
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
1098
+ def get_input_embeddings(self):
1099
+ return self.model.embed_tokens
1100
+
1101
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
1102
+ def set_input_embeddings(self, value):
1103
+ self.model.embed_tokens = value
1104
+
1105
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
1106
+ def get_output_embeddings(self):
1107
+ return self.lm_head
1108
+
1109
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
1110
+ def set_output_embeddings(self, new_embeddings):
1111
+ self.lm_head = new_embeddings
1112
+
1113
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
1114
+ def set_decoder(self, decoder):
1115
+ self.model = decoder
1116
+
1117
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
1118
+ def get_decoder(self):
1119
+ return self.model
1120
+
1121
+ @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
1122
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1123
+ def forward(
1124
+ self,
1125
+ input_ids: torch.LongTensor = None,
1126
+ attention_mask: Optional[torch.Tensor] = None,
1127
+ position_ids: Optional[torch.LongTensor] = None,
1128
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1129
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1130
+ labels: Optional[torch.LongTensor] = None,
1131
+ use_cache: Optional[bool] = None,
1132
+ output_attentions: Optional[bool] = None,
1133
+ output_hidden_states: Optional[bool] = None,
1134
+ return_dict: Optional[bool] = None,
1135
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1136
+ r"""
1137
+ Args:
1138
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1139
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1140
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1141
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1142
+
1143
+ Returns:
1144
+
1145
+ Example:
1146
+
1147
+ ```python
1148
+ >>> from transformers import AutoTokenizer, PhiForCausalLM
1149
+
1150
+ >>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1")
1151
+ >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
1152
+
1153
+ >>> prompt = "This is an example script ."
1154
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1155
+
1156
+ >>> # Generate
1157
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1158
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1159
+ 'This is an example script .\n\n\n\nfrom typing import List\n\ndef find_most_common_letter(words: List[str'
1160
+ ```"""
1161
+
1162
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1163
+ output_hidden_states = (
1164
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1165
+ )
1166
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1167
+
1168
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1169
+ outputs = self.model(
1170
+ input_ids=input_ids,
1171
+ attention_mask=attention_mask,
1172
+ position_ids=position_ids,
1173
+ past_key_values=past_key_values,
1174
+ inputs_embeds=inputs_embeds,
1175
+ use_cache=use_cache,
1176
+ output_attentions=output_attentions,
1177
+ output_hidden_states=output_hidden_states,
1178
+ return_dict=return_dict,
1179
+ )
1180
+
1181
+ hidden_states = outputs[0]
1182
+ logits = self.lm_head(hidden_states)
1183
+ logits = logits.float()
1184
+
1185
+ loss = None
1186
+ if labels is not None:
1187
+ # Shift so that tokens < n predict n
1188
+ shift_logits = logits[..., :-1, :].contiguous()
1189
+ shift_labels = labels[..., 1:].contiguous()
1190
+ # Flatten the tokens
1191
+ loss_fct = CrossEntropyLoss()
1192
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1193
+ shift_labels = shift_labels.view(-1)
1194
+ # Enable model parallelism
1195
+ shift_labels = shift_labels.to(shift_logits.device)
1196
+ loss = loss_fct(shift_logits, shift_labels)
1197
+
1198
+ if not return_dict:
1199
+ output = (logits,) + outputs[1:]
1200
+ return (loss,) + output if loss is not None else output
1201
+
1202
+ return CausalLMOutputWithPast(
1203
+ loss=loss,
1204
+ logits=logits,
1205
+ past_key_values=outputs.past_key_values,
1206
+ hidden_states=outputs.hidden_states,
1207
+ attentions=outputs.attentions,
1208
+ )
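The loss computed above uses the standard causal-LM shift: logits at position i are scored against the label at position i+1, so the last position has no target and the first token is never predicted. A tiny self-contained illustration with toy tensors (shapes and values are made up):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
logits = torch.randn(2, 5, vocab_size)           # (batch, seq_len, vocab), toy values
labels = torch.randint(0, vocab_size, (2, 5))    # labels are the input ids themselves

shift_logits = logits[..., :-1, :].contiguous()  # positions 0..3 ...
shift_labels = labels[..., 1:].contiguous()      # ... predict tokens 1..4
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))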
1209
+
1210
+ # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
1211
+ def prepare_inputs_for_generation(
1212
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1213
+ ):
1214
+ if past_key_values is not None:
1215
+ if isinstance(past_key_values, Cache):
1216
+ cache_length = past_key_values.get_seq_length()
1217
+ past_length = past_key_values.seen_tokens
1218
+ max_cache_length = past_key_values.get_max_length()
1219
+ else:
1220
+ cache_length = past_length = past_key_values[0][0].shape[2]
1221
+ max_cache_length = None
1222
+
1223
+ # Keep only the unprocessed tokens:
1224
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1225
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1226
+ # input)
1227
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1228
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1229
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1230
+ # input_ids based on the past_length.
1231
+ elif past_length < input_ids.shape[1]:
1232
+ input_ids = input_ids[:, past_length:]
1233
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1234
+
1235
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1236
+ if (
1237
+ max_cache_length is not None
1238
+ and attention_mask is not None
1239
+ and cache_length + input_ids.shape[1] > max_cache_length
1240
+ ):
1241
+ attention_mask = attention_mask[:, -max_cache_length:]
1242
+
1243
+ position_ids = kwargs.get("position_ids", None)
1244
+ if attention_mask is not None and position_ids is None:
1245
+ # create position_ids on the fly for batch generation
1246
+ position_ids = attention_mask.long().cumsum(-1) - 1
1247
+ position_ids.masked_fill_(attention_mask == 0, 1)
1248
+ if past_key_values:
1249
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1250
+
1251
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1252
+ if inputs_embeds is not None and past_key_values is None:
1253
+ model_inputs = {"inputs_embeds": inputs_embeds}
1254
+ else:
1255
+ model_inputs = {"input_ids": input_ids}
1256
+
1257
+ model_inputs.update(
1258
+ {
1259
+ "position_ids": position_ids,
1260
+ "past_key_values": past_key_values,
1261
+ "use_cache": kwargs.get("use_cache"),
1262
+ "attention_mask": attention_mask,
1263
+ }
1264
+ )
1265
+ return model_inputs
1266
+
1267
+ @staticmethod
1268
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
1269
+ def _reorder_cache(past_key_values, beam_idx):
1270
+ reordered_past = ()
1271
+ for layer_past in past_key_values:
1272
+ reordered_past += (
1273
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1274
+ )
1275
+ return reordered_past
1276
+
1277
+
1278
+ @add_start_docstrings(
1279
+ """
1280
+ The PhiModel with a sequence classification head on top (linear layer).
1281
+
1282
+ [`PhiForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1283
+ (e.g. GPT-2) do.
1284
+
1285
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1286
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1287
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1288
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1289
+ each row of the batch).
1290
+ """,
1291
+ PHI_START_DOCSTRING,
1292
+ )
1293
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->PHI,Llama->Phi with self.transformer->self.model, transformer_outputs->model_outputs
1294
+ class PhiForSequenceClassification(PhiPreTrainedModel):
1295
+ def __init__(self, config):
1296
+ super().__init__(config)
1297
+ self.num_labels = config.num_labels
1298
+ self.model = PhiModel(config)
1299
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1300
+
1301
+ # Initialize weights and apply final processing
1302
+ self.post_init()
1303
+
1304
+ def get_input_embeddings(self):
1305
+ return self.model.embed_tokens
1306
+
1307
+ def set_input_embeddings(self, value):
1308
+ self.model.embed_tokens = value
1309
+
1310
+ @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
1311
+ def forward(
1312
+ self,
1313
+ input_ids: torch.LongTensor = None,
1314
+ attention_mask: Optional[torch.Tensor] = None,
1315
+ position_ids: Optional[torch.LongTensor] = None,
1316
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1317
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1318
+ labels: Optional[torch.LongTensor] = None,
1319
+ use_cache: Optional[bool] = None,
1320
+ output_attentions: Optional[bool] = None,
1321
+ output_hidden_states: Optional[bool] = None,
1322
+ return_dict: Optional[bool] = None,
1323
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1324
+ r"""
1325
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1326
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1327
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1328
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1329
+ """
1330
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1331
+
1332
+ model_outputs = self.model(
1333
+ input_ids,
1334
+ attention_mask=attention_mask,
1335
+ position_ids=position_ids,
1336
+ past_key_values=past_key_values,
1337
+ inputs_embeds=inputs_embeds,
1338
+ use_cache=use_cache,
1339
+ output_attentions=output_attentions,
1340
+ output_hidden_states=output_hidden_states,
1341
+ return_dict=return_dict,
1342
+ )
1343
+ hidden_states = model_outputs[0]
1344
+ logits = self.score(hidden_states)
1345
+
1346
+ if input_ids is not None:
1347
+ batch_size = input_ids.shape[0]
1348
+ else:
1349
+ batch_size = inputs_embeds.shape[0]
1350
+
1351
+ if self.config.pad_token_id is None and batch_size != 1:
1352
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1353
+ if self.config.pad_token_id is None:
1354
+ sequence_lengths = -1
1355
+ else:
1356
+ if input_ids is not None:
1357
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1358
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1359
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1360
+ sequence_lengths = sequence_lengths.to(logits.device)
1361
+ else:
1362
+ sequence_lengths = -1
1363
+
1364
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1365
+
1366
+ loss = None
1367
+ if labels is not None:
1368
+ labels = labels.to(logits.device)
1369
+ if self.config.problem_type is None:
1370
+ if self.num_labels == 1:
1371
+ self.config.problem_type = "regression"
1372
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1373
+ self.config.problem_type = "single_label_classification"
1374
+ else:
1375
+ self.config.problem_type = "multi_label_classification"
1376
+
1377
+ if self.config.problem_type == "regression":
1378
+ loss_fct = MSELoss()
1379
+ if self.num_labels == 1:
1380
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1381
+ else:
1382
+ loss = loss_fct(pooled_logits, labels)
1383
+ elif self.config.problem_type == "single_label_classification":
1384
+ loss_fct = CrossEntropyLoss()
1385
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1386
+ elif self.config.problem_type == "multi_label_classification":
1387
+ loss_fct = BCEWithLogitsLoss()
1388
+ loss = loss_fct(pooled_logits, labels)
1389
+ if not return_dict:
1390
+ output = (pooled_logits,) + model_outputs[1:]
1391
+ return ((loss,) + output) if loss is not None else output
1392
+
1393
+ return SequenceClassifierOutputWithPast(
1394
+ loss=loss,
1395
+ logits=pooled_logits,
1396
+ past_key_values=model_outputs.past_key_values,
1397
+ hidden_states=model_outputs.hidden_states,
1398
+ attentions=model_outputs.attentions,
1399
+ )
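The pooling above relies on an ONNX-friendly trick to find the last non-padding token of each row: `argmax` over `input_ids == pad_token_id` returns the first pad position, subtracting 1 gives the last real token, and the modulo wraps rows that contain no padding around to the final position. A toy example (pad_token_id is assumed to be 0 here):

import torch

pad_token_id = 0                                # assumption for this toy example
input_ids = torch.tensor([[5, 6, 7, 0, 0],      # three real tokens, then right padding
                          [1, 2, 3, 4, 5]])     # no padding at all

sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)                         # tensor([2, 4]): last real token per row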
1400
+
1401
+
1402
+ @add_start_docstrings(
1403
+ """
1404
+ PhiModel with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1405
+ Named-Entity-Recognition (NER) tasks.
1406
+ """,
1407
+ PHI_START_DOCSTRING,
1408
+ )
1409
+ # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with MPT->PHI,Mpt->Phi,self.transformer->self.model,transformer_outputs->model_outputs
1410
+ class PhiForTokenClassification(PhiPreTrainedModel):
1411
+ def __init__(self, config: PhiConfig):
1412
+ super().__init__(config)
1413
+ self.num_labels = config.num_labels
1414
+
1415
+ self.model = PhiModel(config)
1416
+ if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
1417
+ classifier_dropout = config.classifier_dropout
1418
+ elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
1419
+ classifier_dropout = config.hidden_dropout
1420
+ else:
1421
+ classifier_dropout = 0.1
1422
+ self.dropout = nn.Dropout(classifier_dropout)
1423
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1424
+
1425
+ # Initialize weights and apply final processing
1426
+ self.post_init()
1427
+
1428
+ @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
1429
+ @add_code_sample_docstrings(
1430
+ checkpoint=_CHECKPOINT_FOR_DOC,
1431
+ output_type=TokenClassifierOutput,
1432
+ config_class=_CONFIG_FOR_DOC,
1433
+ )
1434
+ def forward(
1435
+ self,
1436
+ input_ids: Optional[torch.LongTensor] = None,
1437
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1438
+ attention_mask: Optional[torch.Tensor] = None,
1439
+ inputs_embeds: Optional[torch.Tensor] = None,
1440
+ labels: Optional[torch.Tensor] = None,
1441
+ use_cache: Optional[bool] = None,
1442
+ output_attentions: Optional[bool] = None,
1443
+ output_hidden_states: Optional[bool] = None,
1444
+ return_dict: Optional[bool] = None,
1445
+ **deprecated_arguments,
1446
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1447
+ r"""
1448
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1449
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1450
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1451
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1452
+ """
1453
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1454
+
1455
+ model_outputs = self.model(
1456
+ input_ids,
1457
+ past_key_values=past_key_values,
1458
+ attention_mask=attention_mask,
1459
+ inputs_embeds=inputs_embeds,
1460
+ use_cache=use_cache,
1461
+ output_attentions=output_attentions,
1462
+ output_hidden_states=output_hidden_states,
1463
+ return_dict=return_dict,
1464
+ )
1465
+
1466
+ hidden_states = model_outputs[0]
1467
+ hidden_states = self.dropout(hidden_states)
1468
+ logits = self.classifier(hidden_states)
1469
+
1470
+ loss = None
1471
+ if labels is not None:
1472
+ # move labels to correct device to enable model parallelism
1473
+ labels = labels.to(logits.device)
1474
+ batch_size, seq_length = labels.shape
1475
+ loss_fct = CrossEntropyLoss()
1476
+ loss = loss_fct(
1477
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
1478
+ )
1479
+
1480
+ if not return_dict:
1481
+ output = (logits,) + model_outputs[2:]
1482
+ return ((loss,) + output) if loss is not None else output
1483
+
1484
+ return TokenClassifierOutput(
1485
+ loss=loss,
1486
+ logits=logits,
1487
+ hidden_states=model_outputs.hidden_states,
1488
+ attentions=model_outputs.attentions,
1489
+ )
mcp_servers/fashion_vlm/models/sampling.py ADDED
@@ -0,0 +1,118 @@
1
+ # Adapted from https://github.com/lucidrains/muse-maskgit-pytorch
2
+
3
+ import math
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+
10
+ def log(t, eps=1e-20):
11
+ return torch.log(t.clamp(min=eps))
12
+
13
+
14
+ def gumbel_noise(t, generator=None):
15
+ noise = torch.zeros_like(t).uniform_(0, 1, generator=generator)
16
+ return -log(-log(noise))
17
+
18
+
19
+ def gumbel_sample(t, temperature=1.0, dim=-1, generator=None):
20
+ return ((t / max(temperature, 1e-10)) + gumbel_noise(t, generator=generator)).argmax(dim=dim)
21
+
22
+
23
+ def top_k(logits, thres=0.9):
24
+ k = math.ceil((1 - thres) * logits.shape[-1])
25
+ val, ind = logits.topk(k, dim=-1)
26
+ probs = torch.full_like(logits, float("-inf"))
27
+ probs.scatter_(2, ind, val)
28
+ return probs
29
+
30
+
31
+ def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
32
+ confidence = log(probs) + temperature * gumbel_noise(probs, generator=generator)
33
+ sorted_confidence = torch.sort(confidence, dim=-1).values
34
+ cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
35
+ masking = confidence < cut_off
36
+ return masking
37
+
38
+
39
+ def cosine_schedule(t):
40
+ return torch.cos(t * math.pi * 0.5)
41
+
42
+
43
+ def linear_schedule(t):
44
+ mask_ratio = 1 - t
45
+ mask_ratio = mask_ratio.clamp(min=1e-6, max=1.0)
46
+ return mask_ratio
47
+
48
+
49
+ def pow(t, method):
50
+ exponent = float(method.replace("pow", ""))
51
+ mask_ratio = 1.0 - t**exponent
52
+ mask_ratio = mask_ratio.clamp(min=1e-6, max=1.0)
53
+ return mask_ratio
54
+
55
+
56
+ def sigmoid_schedule(t, start=-3, end=3, tau=1.0, clip_min=1e-6):
57
+ # make sure `t` is a tensor (start, end and tau are wrapped with torch.tensor below)
58
+ t = torch.tensor(t) if not torch.is_tensor(t) else t
59
+
60
+ # A gamma function based on sigmoid function.
61
+ v_start = torch.sigmoid(torch.tensor(start / tau))
62
+ v_end = torch.sigmoid(torch.tensor(end / tau))
63
+ output = torch.sigmoid((t * (end - start) + start) / tau)
64
+ output = (v_end - output) / (v_end - v_start)
65
+ return torch.clip(output, clip_min, 1.0)
66
+
67
+
68
+ def get_mask_chedule(method, **schedule_kwargs):
69
+ if method == "cosine":
70
+ return cosine_schedule
71
+ elif method == "linear":
72
+ return linear_schedule
73
+ elif "pow" in method:
74
+ return partial(pow, method=method)
75
+ elif method == "sigmoid":
76
+ return partial(sigmoid_schedule, **schedule_kwargs)
77
+ else:
78
+ raise ValueError("Unknown schedule method: {}".format(method))
79
+
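These schedules map a progress value t in [0, 1] to the fraction of image tokens that should remain masked, and a MaskGIT-style sampler (as in the muse-maskgit-pytorch code this file is adapted from) queries them once per decoding step. A hedged sketch of that usage; the step count and token count are made-up numbers, and the real generation loop lives elsewhere in this repo:

import torch

schedule = get_mask_chedule("cosine")           # note the "chedule" spelling used in this file

num_image_tokens = 1024                          # illustrative
num_steps = 12                                   # illustrative
for step in range(num_steps):
    t = torch.tensor((step + 1) / num_steps)
    mask_ratio = schedule(t)                     # fraction of tokens still masked
    num_masked = int(mask_ratio.item() * num_image_tokens)
    # ...predict all masked tokens, keep the most confident ones, re-mask `num_masked` of them...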
80
+ def top_k_top_p_filtering(
81
+ logits: torch.Tensor,
82
+ top_k: int = 0,
83
+ top_p: float = 1.0,
84
+ filter_value: float = -float("Inf"),
85
+ min_tokens_to_keep: int = 1,
86
+ ) -> torch.Tensor:
87
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
88
+ Args:
89
+ logits: logits distribution shape (batch size, vocabulary size)
90
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
91
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
92
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
93
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
94
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
95
+ """
96
+ if top_k > 0:
97
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
98
+ # Remove all tokens with a probability less than the last token of the top-k
99
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
100
+ logits[indices_to_remove] = filter_value
101
+
102
+ if top_p < 1.0:
103
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
104
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
105
+
106
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
107
+ sorted_indices_to_remove = cumulative_probs > top_p
108
+ if min_tokens_to_keep > 1:
109
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
110
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
111
+ # Shift the indices to the right to keep also the first token above the threshold
112
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
113
+ sorted_indices_to_remove[..., 0] = 0
114
+
115
+ # scatter sorted tensors to original indexing
116
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
117
+ logits[indices_to_remove] = filter_value
118
+ return logits
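`top_k_top_p_filtering` writes `-inf` into every logit outside the kept set (in place) and returns the tensor, so the caller can sample directly from a softmax over the result. A short toy usage:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])                # (batch=1, vocab=4), toy values
filtered = top_k_top_p_filtering(logits.clone(), top_k=2, top_p=1.0)
print(filtered)                                               # [[2.0, 1.0, -inf, -inf]]
next_token = torch.multinomial(F.softmax(filtered, dim=-1), num_samples=1)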
mcp_servers/fashion_vlm/prompting_utils.py ADDED
@@ -0,0 +1,628 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 NUS Show Lab.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ # TODO - SHOULD BE FURTHER IMPROVED.
18
+ class UniversalPrompting():
19
+ def __init__(self, text_tokenizer,
20
+ special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),
21
+ max_text_len=8000, max_seq_len=377, ignore_id=-100, cond_dropout_prob=0.1):
22
+ """
23
+ :param text_tokenizer: original text tokenizer
24
+ """
25
+ self.text_tokenizer = text_tokenizer
26
+ self.text_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
27
+ self.text_tokenizer.add_tokens(list(special_tokens))
28
+ self.sptids_dict = {token: torch.tensor(self.text_tokenizer.convert_tokens_to_ids([token])) for token in
29
+ special_tokens}
30
+ self.sptids_dict['<|sot|>'] = torch.tensor([self.text_tokenizer.bos_token_id])
31
+ self.sptids_dict['<|eot|>'] = torch.tensor([self.text_tokenizer.eos_token_id])
32
+ self.sptids_dict['<|pad|>'] = torch.tensor([self.text_tokenizer.pad_token_id])
33
+ # plus 1 because a task token is prepended to the text later on
34
+ self.max_text_len = max_text_len + 1
35
+ self.pad_id = self.text_tokenizer.convert_tokens_to_ids('[PAD]')
36
+ self.ignore_id = ignore_id
37
+ self.cond_dropout_prob = cond_dropout_prob
38
+
39
+ def t2i_prompt(self, text_ids, image_ids, labels):
40
+
41
+ device = image_ids.device
42
+ sequence_ids = []
43
+ attention_masks = []
44
+ label_ids = []
45
+ probs = torch.rand(len(text_ids))
46
+ for i in range(len(text_ids)):
47
+
48
+ if len(text_ids[i]) == 0:
49
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
50
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
51
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
52
+
53
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
54
+
55
+ # randomly drop out the text condition
56
+ if probs[i] < self.cond_dropout_prob:
57
+ temp_ids = [int(self.sptids_dict['<|t2i|>']), self.text_tokenizer.bos_token_id, self.text_tokenizer.eos_token_id]
58
+
59
+ if self.max_text_len >= len(temp_ids):
60
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
61
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * (len(temp_ids) + image_ids.shape[-1] + 3)
62
+ else:
63
+ # should add the eos token
64
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
65
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) # +2 for two special tokens
66
+
67
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
68
+ temp_label_ids = torch.cat([
69
+ # should we predict text tokens when doing image reconstruction?
70
+ torch.tensor(temp_ids).to(device),
71
+ self.sptids_dict['<|soi|>'].to(device),
72
+ labels[i],
73
+ self.sptids_dict['<|eoi|>'].to(device)
74
+ ], dim=0)
75
+
76
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
77
+
78
+ temp_ids = torch.cat([
79
+ torch.tensor(temp_ids).to(device),
80
+ self.sptids_dict['<|soi|>'].to(device),
81
+ image_ids[i],
82
+ self.sptids_dict['<|eoi|>'].to(device)
83
+ ], dim=0)
84
+
85
+ temp_masks = torch.tensor(temp_masks).to(device)
86
+ sequence_ids.append(temp_ids.unsqueeze(0))
87
+ attention_masks.append(temp_masks.unsqueeze(0))
88
+ label_ids.append(temp_label_ids.unsqueeze(0))
89
+
90
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
91
+
92
+ def t2i_gen_prompt(self, text_ids, image_ids):
93
+
94
+ device = image_ids.device
95
+ sequence_ids = []
96
+ attention_masks = []
97
+ for i in range(len(text_ids)):
98
+ if len(text_ids[i]) == 0:
99
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
100
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
101
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
102
+ # note that the llama3 tokenizer automatically adds the bos token at the start but not the eos token
103
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
104
+ if self.max_text_len >= len(temp_ids):
105
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
106
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * len(temp_ids)
107
+ else:
108
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
109
+ temp_masks = [1] * len(temp_ids) # +2 for two special tokens
110
+
111
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
112
+ temp_ids = torch.cat([
113
+ torch.tensor(temp_ids).to(device),
114
+ self.sptids_dict['<|soi|>'].to(device),
115
+ image_ids[i],
116
+ self.sptids_dict['<|eoi|>'].to(device)
117
+ ], dim=0)
118
+
119
+ temp_masks = torch.tensor(temp_masks).to(device)
120
+ sequence_ids.append(temp_ids.unsqueeze(0))
121
+ attention_masks.append(temp_masks.unsqueeze(0))
122
+
123
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0)
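Both t2i paths assemble the layout described in the inline comments, with the text block left-padded up to max_text_len. A purely illustrative schematic of the resulting sequence (token names only; the padding length, the text, and the number of image tokens are made up):

t2i_sequence = (
    ["[PAD]"] * 3                        # left padding up to max_text_len
    + ["<|t2i|>"]                        # task token
    + ["<|sot|>"]                        # bos of the text tokenizer
    + ["a", "red", "dress"]              # text condition tokens (toy example)
    + ["<|eot|>"]                        # eos of the text tokenizer
    + ["<|soi|>"]                        # start-of-image token
    + [f"img_{i}" for i in range(1024)]  # discrete image token ids (count is illustrative)
    + ["<|eoi|>"]                        # end-of-image token
)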
124
+
125
+ # language modeling
126
+ def lm_prompt(self, text_ids, max_seq_len):
127
+
128
+ sequence_ids = []
129
+ attention_masks = []
130
+ label_ids = []
131
+ for i in range(len(text_ids)):
132
+ if len(text_ids[i]) == 0:
133
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
134
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
135
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
136
+
137
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
138
+
139
+ if max_seq_len >= len(temp_ids):
140
+ temp_labels_ids = temp_ids + [self.ignore_id] * (max_seq_len - len(temp_ids))
141
+ temp_ids = temp_ids + [self.pad_id] * (max_seq_len - len(temp_ids))
142
+ temp_masks = [1] * len(temp_ids) + [0] * (max_seq_len - len(temp_ids))
143
+ else:
144
+ # In language modeling, we only process text tokens. We do not add the eos token if the text length
145
+ # exceeds the max sequence length
146
+ temp_labels_ids = temp_ids[:max_seq_len]
147
+ temp_ids = temp_ids[:max_seq_len]
148
+ temp_masks = [1] * len(temp_ids) # +2 for two special tokens
149
+
150
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
151
+ temp_ids = torch.tensor(temp_ids)
152
+ temp_masks = torch.tensor(temp_masks)
153
+ temp_labels_ids = torch.tensor(temp_labels_ids)
154
+
155
+ sequence_ids.append(temp_ids.unsqueeze(0))
156
+ attention_masks.append(temp_masks.unsqueeze(0))
157
+ label_ids.append(temp_labels_ids.unsqueeze(0))
158
+
159
+ # input_ids, masks, labels
160
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
161
+
162
+ def mmu_prompt(self, image_ids, text_ids):
163
+ device = image_ids.device
164
+ sequence_ids = []
165
+ attention_masks = []
166
+ label_ids = []
167
+ max_text_len = self.max_text_len - 1
168
+ for i in range(len(text_ids)):
169
+ # note that the llama3 tokenizer automatically adds the bos token at the start but not the eos token
170
+ # for empty list []
171
+
172
+ if len(text_ids[i]) == 0:
173
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
174
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
175
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
176
+
177
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
178
+
179
+ if max_text_len >= len(temp_ids):
180
+ # minus 1 because task token was prepended to the former image tokens
181
+ temp_ids = temp_ids + [self.pad_id] * (max_text_len - len(temp_ids))
182
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) + [0] * (max_text_len - len(temp_ids))
183
+ else:
184
+ # should add the eos token
185
+ temp_ids = temp_ids[:max_text_len - 1] + [self.text_tokenizer.eos_token_id]
186
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) # +2 for two special tokens
187
+
188
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
189
+ temp_label_ids = torch.cat([
190
+ torch.tensor([self.ignore_id]).to(device),
191
+ torch.tensor([self.ignore_id]).to(device),
192
+ torch.ones_like(image_ids[i]) * self.ignore_id,
193
+ torch.tensor([self.ignore_id]).to(device),
194
+ torch.tensor(temp_ids).to(device),
195
+ ], dim=0)
196
+
197
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
198
+
199
+ temp_ids = torch.cat([
200
+ self.sptids_dict['<|mmu|>'].to(device), # task token
201
+ self.sptids_dict['<|soi|>'].to(device),
202
+ image_ids[i],
203
+ self.sptids_dict['<|eoi|>'].to(device),
204
+ torch.tensor(temp_ids).to(device),
205
+ ], dim=0)
206
+
207
+ temp_masks = torch.tensor(temp_masks).to(device)
208
+ sequence_ids.append(temp_ids.unsqueeze(0))
209
+ attention_masks.append(temp_masks.unsqueeze(0))
210
+ label_ids.append(temp_label_ids.unsqueeze(0))
211
+
212
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
213
+
214
+ def t2v_prompt(self, text_ids, image_ids, labels):
215
+
216
+ device = image_ids.device
217
+ sequence_ids = []
218
+ attention_masks = []
219
+ label_ids = []
220
+ probs = torch.rand(len(text_ids))
221
+ for i in range(len(text_ids)):
222
+
223
+ if len(text_ids[i]) == 0:
224
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
225
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
226
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
227
+
228
+ temp_ids = [int(self.sptids_dict['<|t2v|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
229
+
230
+ # randomly dropout text condition
231
+ if probs[i] < self.cond_dropout_prob:
232
+ temp_ids = [int(self.sptids_dict['<|t2v|>']), self.text_tokenizer.bos_token_id,
233
+ self.text_tokenizer.eos_token_id]
234
+
235
+ if self.max_text_len >= len(temp_ids):
236
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
237
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * (len(temp_ids) + image_ids.shape[-1] + 3)
238
+ else:
239
+ # should add the eos token
240
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
241
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) # +2 for two special tokens
242
+
243
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
244
+ temp_label_ids = torch.cat([
245
+ # should we predict text tokens when doing image reconstruction?
246
+ torch.tensor(temp_ids).to(device),
247
+ self.sptids_dict['<|sov|>'].to(device),
248
+ labels[i],
249
+ self.sptids_dict['<|eov|>'].to(device)
250
+ ], dim=0)
251
+
252
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
253
+
254
+ temp_ids = torch.cat([
255
+ torch.tensor(temp_ids).to(device),
256
+ self.sptids_dict['<|sov|>'].to(device),
257
+ image_ids[i],
258
+ self.sptids_dict['<|eov|>'].to(device)
259
+ ], dim=0)
260
+
261
+ temp_masks = torch.tensor(temp_masks).to(device)
262
+ sequence_ids.append(temp_ids.unsqueeze(0))
263
+ attention_masks.append(temp_masks.unsqueeze(0))
264
+ label_ids.append(temp_label_ids.unsqueeze(0))
265
+
266
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
267
+
268
+ def t2v_gen_prompt(self, text_ids, image_ids):
269
+
270
+ device = image_ids.device
271
+ sequence_ids = []
272
+ attention_masks = []
273
+ for i in range(len(text_ids)):
274
+ if len(text_ids[i]) == 0:
275
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
276
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
277
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
278
+ # note: the llama3 tokenizer automatically adds the begin-of-text (bos) token but does not append the end-of-text (eos) token
279
+ temp_ids = [int(self.sptids_dict['<|t2v|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
280
+ if self.max_text_len >= len(temp_ids):
281
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
282
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * len(temp_ids)
283
+ else:
284
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
285
+ temp_masks = [1] * len(temp_ids) # +2 for two special tokens
286
+
287
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
288
+ temp_ids = torch.cat([
289
+ torch.tensor(temp_ids).to(device),
290
+ self.sptids_dict['<|sov|>'].to(device),
291
+ image_ids[i],
292
+ self.sptids_dict['<|eov|>'].to(device)
293
+ ], dim=0)
294
+
295
+ temp_masks = torch.tensor(temp_masks).to(device)
296
+ sequence_ids.append(temp_ids.unsqueeze(0))
297
+ attention_masks.append(temp_masks.unsqueeze(0))
298
+
299
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0)
300
+
301
+ def i2v_prompt(self, image_ids, video_ids):
302
+ """
303
+ :param image_ids:
304
+ :param video_ids:
305
+ :return:
306
+ """
307
+ pass
308
+
309
+ def lvg_prompt(self, text_ids, image_ids, labels):
310
+
311
+ device = image_ids.device
312
+ sequence_ids = []
313
+ attention_masks = []
314
+ label_ids = []
315
+ probs = torch.rand(len(text_ids))
316
+ probs2 = torch.rand(len(text_ids))
317
+ for i in range(len(text_ids)):
318
+
319
+ if len(text_ids[i]) == 0:
320
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
321
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
322
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
323
+
324
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
325
+
326
+ # randomly dropout text condition
327
+ if probs[i] < self.cond_dropout_prob:
328
+ temp_ids = [int(self.sptids_dict['<|t2i|>']), self.text_tokenizer.bos_token_id,
329
+ self.text_tokenizer.eos_token_id]
330
+
331
+ if self.max_text_len >= len(temp_ids):
332
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
333
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * (len(temp_ids) + image_ids.shape[-1] + 3)
334
+ else:
335
+ # should add the eos token
336
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
337
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) # +2 for two special tokens
338
+
339
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
340
+ temp_label_ids = torch.cat([
341
+ # should we predict text tokens when doing image reconstruction?
342
+ torch.tensor(temp_ids).to(device),
343
+ self.sptids_dict['<|soi|>'].to(device),
344
+ labels[i],
345
+ self.sptids_dict['<|eoi|>'].to(device)
346
+ ], dim=0)
347
+
348
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
349
+
350
+ temp_ids = torch.cat([
351
+ torch.tensor(temp_ids).to(device),
352
+ self.sptids_dict['<|soi|>'].to(device),
353
+ image_ids[i],
354
+ self.sptids_dict['<|eoi|>'].to(device)
355
+ ], dim=0)
356
+
357
+ temp_masks = torch.tensor(temp_masks).to(device)
358
+ sequence_ids.append(temp_ids.unsqueeze(0))
359
+ attention_masks.append(temp_masks.unsqueeze(0))
360
+ label_ids.append(temp_label_ids.unsqueeze(0))
361
+
362
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
363
+
364
+ def lvg_gen_prompt(self, text_ids, image_ids):
365
+
366
+ device = image_ids.device
367
+ sequence_ids = []
368
+ attention_masks = []
369
+ for i in range(len(text_ids)):
370
+ if len(text_ids[i]) == 0:
371
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
372
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
373
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
374
+ # note: the llama3 tokenizer automatically adds the begin-of-text (bos) token but does not append the end-of-text (eos) token
375
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
376
+ if self.max_text_len >= len(temp_ids):
377
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
378
+ temp_masks = [0] * (self.max_text_len - len(temp_ids)) + [1] * len(temp_ids)
379
+ else:
380
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
381
+ temp_masks = [1] * len(temp_ids) # +2 for two special tokens
382
+
383
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
384
+ temp_ids = torch.cat([
385
+ torch.tensor(temp_ids).to(device),
386
+ self.sptids_dict['<|soi|>'].to(device),
387
+ image_ids[i],
388
+ self.sptids_dict['<|eoi|>'].to(device)
389
+ ], dim=0)
390
+
391
+ temp_masks = torch.tensor(temp_masks).to(device)
392
+ sequence_ids.append(temp_ids.unsqueeze(0))
393
+ attention_masks.append(temp_masks.unsqueeze(0))
394
+
395
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0)
396
+
397
+ def mask_prompt(self):
398
+ pass
399
+
400
+ def __call__(self, input, task, padding=True, config=None):
401
+ """
402
+ input (tuple) : data pairs contain text(str), image(tensor), or videos(tensor).
403
+ task (str) : a flag indicates the current task.
404
+ """
405
+ if task == "t2i":
406
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
407
+ image_ids = input[1] # (B, #tokens)
408
+ sequence_ids_with_masks = self.t2i_prompt(text_ids, image_ids, input[2])
409
+
410
+ elif task == "t2v":
411
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
412
+ image_ids = input[1] # (B, #tokens)
413
+ sequence_ids_with_masks = self.t2v_prompt(text_ids, image_ids, input[2])
414
+
415
+ elif task == "t2i_plus_lm":
416
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
417
+ image_ids = input[1] # (B, #tokens)
418
+ sequence_ids_with_masks = self.t2i_prompt(text_ids[:config.training.batch_size], image_ids,
419
+ input[2])
420
+ sequence_ids_with_masks_lm = self.lm_prompt(text_ids[config.training.batch_size:], input[3])
421
+ return sequence_ids_with_masks, sequence_ids_with_masks_lm
422
+
423
+ elif task == "t2i_gen":
424
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
425
+ image_ids = input[1] # (B, #tokens)
426
+ sequence_ids_with_masks = self.t2i_gen_prompt(text_ids, image_ids)
427
+
428
+ elif task == "t2v_gen":
429
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
430
+ image_ids = input[1] # (B, #tokens)
431
+ sequence_ids_with_masks = self.t2v_gen_prompt(text_ids, image_ids)
432
+
433
+ elif task == "lm":
434
+ text_ids = self.text_tokenizer(input[0], truncation=True)['input_ids'] # (B, max_len)
435
+ sequence_ids_with_masks = self.lm_prompt(text_ids, input[1])
436
+
437
+ elif task == "mmu":
438
+ image_ids = input[0]
439
+ text_ids = self.text_tokenizer(input[1])['input_ids']
440
+ sequence_ids_with_masks = self.mmu_prompt(image_ids, text_ids)
441
+
442
+ elif task == "t2v":
443
+ text_ids = self.text_tokenizer(input[0])['input_ids']  # note: this branch is shadowed by the earlier "t2v" case above
444
+ video_ids = self.vision_tokenizer(input[1])
445
+ sequence_ids_with_masks = self.t2v_prompt(text_ids, video_ids)
446
+
447
+ elif task == "i2v":
448
+ image_ids = self.text_tokenizer(input[0])
449
+ video_ids = self.vision_tokenizer(input[1])
450
+ sequence_ids_with_masks = self.i2v_prompt(image_ids, video_ids)
451
+
452
+ elif task == "lvg":
453
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
454
+ image_ids = input[1] # (B, #tokens)
455
+ sequence_ids_with_masks = self.lvg_prompt(text_ids, image_ids, input[2])
456
+
457
+ elif task == "lvg_gen":
458
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
459
+ image_ids = input[1] # (B, #tokens)
460
+ sequence_ids_with_masks = self.lvg_gen_prompt(text_ids, image_ids)
461
+ else:
462
+ raise NotImplementedError
463
+
464
+ return sequence_ids_with_masks
465
+
466
+ def create_attention_mask_predict_next(sequence, pad_id=128256, soi_id=128257, eoi_id=128258, rm_pad_in_image=False,
467
+ return_inverse_mask=True):
468
+ # sequence is expected to be of shape [N, L]
469
+ N, L = sequence.shape
470
+
471
+ # Masks to identify different types of tokens
472
+ is_padding = sequence == pad_id
473
+
474
+ is_start_image = sequence == soi_id
475
+
476
+ is_end_image = sequence == eoi_id
477
+
478
+ # Create cumulative sum masks to identify regions of image tokens
479
+ cumulative_start = torch.cumsum(is_start_image, dim=1)
480
+ cumulative_end = torch.cumsum(is_end_image, dim=1)
481
+ in_image_segment = (cumulative_start > cumulative_end) | is_start_image | is_end_image
482
+
483
+ is_text = ~(in_image_segment)
484
+
485
+ causal_mask = torch.tril(torch.ones((L, L), dtype=torch.bool)).to(sequence.device)
486
+
487
+ mask_text = is_text[:, :, None] * causal_mask[None, :, :]
488
+
489
+ is_text_image = is_text | in_image_segment
490
+
491
+ mask_text_image_bi = is_text_image[:, :, None] * is_text_image[:, None, :]
492
+ if rm_pad_in_image: # remove padding token in image
493
+ sid_img = torch.where(sequence == soi_id)[1]
494
+ for i in range(mask_text_image_bi.shape[0]):
495
+ pad_end_idx = torch.where(sequence[i] == pad_id)
496
+ if len(pad_end_idx[0]) != 0:
497
+ pad_end_idx = pad_end_idx[0][-1]
498
+ mask_text[i][pad_end_idx + 1:, :pad_end_idx + 1] = 0
499
+ id_padding = torch.where(is_padding[i] == True)
500
+ mask_text_image_bi[i][sid_img[i]:, id_padding[0]] = 0
501
+
502
+ mask_text[in_image_segment] = mask_text_image_bi[in_image_segment]
503
+ # No token attends to padding tokens and padding tokens do not attend to any token
504
+ if return_inverse_mask:
505
+ inverted_mask = 1.0 - mask_text.type(sequence.dtype)
506
+ inverted_mask = inverted_mask.masked_fill(
507
+ inverted_mask.to(torch.bool), torch.iinfo(sequence.dtype).min
508
+ )
509
+ return inverted_mask.unsqueeze(1)
510
+ else:
511
+ return mask_text.unsqueeze(1)
512
+
513
+ def create_attention_mask_lvg(sequence, pad_id=128256, soi_id=128257, eoi_id=128258, return_inverse_mask=True):
514
+ # sequence is expected to be of shape [N, L]
515
+ N, L = sequence.shape
516
+ # Masks to identify different types of tokens
517
+ is_padding = sequence == pad_id
518
+ mask_text_image_bi = torch.tril(torch.ones(N, L, L), diagonal=0).to(sequence.device)
519
+
520
+ sid_img = torch.where(sequence == soi_id)[1].reshape(mask_text_image_bi.shape[0], -1)[:, 0]
521
+ sid_img_for_bi = torch.where(sequence == soi_id)[1].reshape(mask_text_image_bi.shape[0], -1)
522
+ eid_img_for_bi = torch.where(sequence == eoi_id)[1].reshape(mask_text_image_bi.shape[0], -1)
523
+ for i in range(N):
524
+ id_padding = torch.where(is_padding[i] == True)
525
+ mask_text_image_bi[i][sid_img[i]:, id_padding[0]] = 0
526
+ for j in range(sid_img_for_bi.shape[-1]):
527
+ mask_text_image_bi[i][sid_img_for_bi[i, j]:eid_img_for_bi[i, j] + 1,
528
+ sid_img_for_bi[i, j]:eid_img_for_bi[i, j] + 1] = 1
529
+
530
+ # No token attends to padding tokens and padding tokens do not attend to any token
531
+ if return_inverse_mask:
532
+ inverted_mask = 1.0 - mask_text_image_bi.type(sequence.dtype)
533
+ inverted_mask = inverted_mask.masked_fill(
534
+ inverted_mask.to(torch.bool), torch.iinfo(sequence.dtype).min
535
+ )
536
+ return inverted_mask.unsqueeze(1)
537
+ else:
538
+ return mask_text_image_bi.unsqueeze(1)
539
+
540
+ # texts without attending image regions
541
+ def create_attention_mask_lvg_v2(sequence, pad_id=128256, soi_id=128257, eoi_id=128258, sot_id=1000, eot_id=1001, return_inverse_mask=True):
542
+ # sequence is expected to be of shape [N, L]
543
+ N, L = sequence.shape
544
+ # Masks to identify different types of tokens
545
+ is_padding = sequence == pad_id
546
+ # is_text = torch.where(sequence < 2000, True, False)
547
+ is_text = torch.where(sequence < pad_id, True, False)
548
+ mask_text_image_bi = torch.tril(torch.ones(N, L, L), diagonal=0).to(sequence.device).int()
549
+ sid_text_for_bi = torch.where(sequence == sot_id)[1].reshape(mask_text_image_bi.shape[0], -1)
550
+ eid_text_for_bi = torch.where(sequence == eot_id)[1].reshape(mask_text_image_bi.shape[0], -1)
551
+ # import ipdb
552
+ # ipdb.set_trace()
553
+ if sot_id == eot_id:
554
+ if sid_text_for_bi.shape[-1] % 2 != 0:
555
+ sid_text_for_bi = sid_text_for_bi[:, :-1]
556
+ eid_text_for_bi = eid_text_for_bi[:, :-1]
557
+ select_idx = [i for i in range(0, sid_text_for_bi.shape[1], 2)]
558
+ sid_text_for_bi = sid_text_for_bi[:, select_idx]
559
+ select_idx = [i+1 for i in range(0, eid_text_for_bi.shape[1], 2)]
560
+ eid_text_for_bi = eid_text_for_bi[:, select_idx]
561
+ sid_img_for_bi = torch.where(sequence == soi_id)[1].reshape(mask_text_image_bi.shape[0], -1)
562
+ eid_img_for_bi = torch.where(sequence == eoi_id)[1].reshape(mask_text_image_bi.shape[0], -1)
563
+ all_zeros = torch.zeros_like(mask_text_image_bi).int()
564
+ for i in range(N):
565
+ all_zeros[i, :, is_text[i]] = 1
566
+ for j in range(sid_text_for_bi.shape[-1]):
567
+ all_zeros[i][is_text[i], sid_text_for_bi[i, j]:eid_text_for_bi[i, j]+1] = 1
568
+ all_zeros[i][~is_text[i], sid_text_for_bi[i, j]:eid_text_for_bi[i, j]+1] = 1
569
+ for j in range(sid_img_for_bi.shape[-1]):
570
+ all_zeros[i][~is_text[i], sid_img_for_bi[i, j]:eid_img_for_bi[i, j]+1] = 1
571
+ mask_text_image_bi = mask_text_image_bi * all_zeros
572
+ sid_img = torch.where(sequence == soi_id)[1].reshape(mask_text_image_bi.shape[0], -1)[:, 0]
573
+
574
+ for i in range(N):
575
+ id_padding = torch.where(is_padding[i] == True)
576
+ mask_text_image_bi[i][sid_img[i]:, id_padding[0]] = 0
577
+ for j in range(sid_img_for_bi.shape[-1]):
578
+ mask_text_image_bi[i][sid_img_for_bi[i, j]:eid_img_for_bi[i, j]+1, sid_img_for_bi[i, j]:eid_img_for_bi[i, j]+1] = 1
579
+
580
+ mask_text_image_bi[:, :, 0] = 1
581
+ # No token attends to padding tokens and padding tokens do not attend to any token
582
+ if return_inverse_mask:
583
+ inverted_mask = 1.0 - mask_text_image_bi.type(sequence.dtype)
584
+ inverted_mask = inverted_mask.masked_fill(
585
+ inverted_mask.to(torch.bool), torch.iinfo(sequence.dtype).min
586
+ )
587
+ return inverted_mask.unsqueeze(1)
588
+ else:
589
+ return mask_text_image_bi.unsqueeze(1)
590
+
591
+ def create_attention_mask_for_mmu(sequence, eoi_id=128258, return_inverse_mask=True):
592
+ N, L = sequence.shape
593
+ causal_mask = torch.tril(torch.ones((N, 1, L, L), dtype=torch.bool)).to(sequence.device)
594
+ eoi_image = torch.where(sequence == eoi_id)[1]
595
+ causal_mask[:, :, :, :eoi_image[0] + 1] = 1
596
+
597
+ if return_inverse_mask:
598
+ inverted_mask = 1.0 - causal_mask.type(sequence.dtype)
599
+ inverted_mask = inverted_mask.masked_fill(
600
+ inverted_mask.to(torch.bool), torch.iinfo(sequence.dtype).min
601
+ )
602
+ return inverted_mask
603
+ else:
604
+ return causal_mask
605
+
606
+ def create_attention_mask_for_mmu_vit(
607
+ sequence,
608
+ return_inverse_mask=True,
609
+ system_prompt_len=0
610
+ ):
611
+ N, L, H = sequence.shape
612
+ causal_mask = torch.tril(torch.ones((N, 1, L, L), dtype=torch.bool)).to(sequence.device)
613
+ index = 1 + system_prompt_len + 1 + 576
614
+ # PART OF SYSTEM PROMPT SHOULD BE CAUSAL ALSO
615
+ # causal_mask[:, :, :, :index] = 1
616
+ causal_mask[:, :, :, 1+system_prompt_len+1:index] = 1  # allow full (bidirectional) attention over the image-token columns
617
+ if return_inverse_mask:
618
+ inverted_mask = 1.0 - causal_mask.type(torch.int64)
619
+ inverted_mask = inverted_mask.masked_fill(
620
+ inverted_mask.to(torch.bool), torch.iinfo(torch.int64).min
621
+ )
622
+ return inverted_mask
623
+ else:
624
+ return causal_mask
625
+
626
+
627
+ if __name__ == '__main__':
628
+ pass
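For a quick sanity check of the attention-mask helpers above, the following sketch (illustrative only) builds the additive mask for a toy sequence. It assumes the functions are importable at module level and uses the default special-token ids (pad 128256, soi 128257, eoi 128258); in practice these come from the tokenizer configuration.

import torch

# one toy sample: [pad, pad, text, text, soi, image, image, eoi]
seq = torch.tensor([[128256, 128256, 5, 6, 128257, 300, 301, 128258]])
attn = create_attention_mask_predict_next(seq, rm_pad_in_image=True)
print(attn.shape)  # (1, 1, 8, 8): 0 where attention is allowed, a large negative value where it is blocked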
mcp_servers/product_user_database.py ADDED
@@ -0,0 +1,463 @@
1
+ # mcp_servers/product_user_database.py
2
+ import os
3
+ import pickle
4
+ import numpy as np
5
+ from typing import Dict, Any, List
6
+ from dotenv import load_dotenv
7
+ from tqdm import tqdm
8
+ from itertools import combinations
9
+
10
+ from scipy import sparse
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ from mcp.server.fastmcp import FastMCP
13
+ from mcp.server.sse import SseServerTransport
14
+ from starlette.applications import Starlette
15
+ from starlette.routing import Route, Mount
16
+ import uvicorn
17
+ import pandas as pd
18
+ import torch
19
+ from transformers import CLIPProcessor, CLIPModel
20
+ from openai import AsyncOpenAI
21
+
22
+ # Load environment variables
23
+ load_dotenv()
24
+ FASHION_DATA_ROOT = os.getenv("FASHION_DATA_ROOT", "/mnt/d/PostDoc/fifth paper/code/FashionVLM/datasets/FashionRec")
25
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
26
+ OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
27
+ openai = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_API_BASE)
28
+
29
+ ###################################
30
+ #########Loading Model#############
31
+ ###################################
32
+ # Load CLIP model and processor
33
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", local_files_only=True)
34
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", local_files_only=True)
35
+ clip_model.eval()
36
+
37
+
38
+ # Load item metadata
39
+ items_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/items_lite.parquet").set_index("item_id")
40
+ outfits_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/outfits_lite.parquet").set_index("outfit_id")
41
+ users_df = pd.read_parquet(f"{FASHION_DATA_ROOT}/meta/users_lite.parquet").set_index("user_id")
42
+ image_paths = items_df["path"].to_dict()
43
+
44
+
45
+ class InteractionDataManager:
46
+ def __init__(self, users_df, outfits_df, items_df):
47
+ """
48
+ Initialize the manager: store the data frames and build the basic lookup tables.
49
+
50
+ Args:
51
+ - users_df: user metadata DataFrame (loaded from parquet)
52
+ - outfits_df: outfit metadata DataFrame (loaded from parquet)
53
+ - items_df: item metadata DataFrame (loaded from parquet)
54
+ """
55
+ self.users_df = users_df
56
+ self.outfits_df = outfits_df
57
+ self.items_df = items_df
58
+
59
+ # Build id <-> index mappings
60
+ self.item_id_to_index = {item_id: index for index, item_id in enumerate(self.items_df.index)}
61
+ self.index_to_item_id = {index: item_id for index, item_id in enumerate(self.items_df.index)}
62
+ self.user_id_to_index = {user_id: index for index, user_id in enumerate(self.users_df.index)}
63
+ self.index_to_user_id = {index: user_id for index, user_id in enumerate(self.users_df.index)}
64
+ self.outfit_ids_dict = self.outfits_df['item_ids'].to_dict() # get outfit's item ids from outfit id
65
+ self.item_category_dict = self.items_df['category'].to_dict() # get item's category from item id
66
+ self.item_subcategory_dict = self.items_df['subcategory'].to_dict() # get item's subcategory from item id
67
+ self.n_items = len(self.items_df)
68
+ self.n_users = len(self.users_df)
69
+
70
+ self.user_outfit_pairs = []
71
+ outfit_set = set(self.outfits_df.index)
72
+ for uid, user in self.users_df.iterrows():
73
+ oids = user.outfit_ids.split(",")
74
+ self.user_outfit_pairs.extend([(uid, oid) for oid in oids if oid in outfit_set])
75
+
76
+ # Precompute the subcategory -> item-id mapping (via groupby)
77
+ self.subcategory_to_items = self.items_df.groupby('subcategory').apply(lambda x: set(x.index)).to_dict()
78
+
79
+ # Precompute the subcategory -> item-index mapping for faster lookups
80
+ self.subcategory_to_indices = {}
81
+ for subcategory, item_ids in self.subcategory_to_items.items():
82
+ self.subcategory_to_indices[subcategory] = set([self.item_id_to_index[item_id]
83
+ for item_id in item_ids
84
+ if item_id in self.item_id_to_index])
85
+
86
+ item_interaction_matrix_path = f'{FASHION_DATA_ROOT}/data/personalized_recommendation/temp_matrix/item_matrix.npz'
87
+ try:
88
+ self.load_matrix('item', item_interaction_matrix_path)
89
+ except FileNotFoundError:
90
+ self.build_item_interaction_matrix()
91
+ self.save_matrix('item', item_interaction_matrix_path)
92
+
93
+ user_item_interaction_matrix_path = f'{FASHION_DATA_ROOT}/data/personalized_recommendation/temp_matrix/user_item_matrix.npz'
94
+ try:
95
+ self.load_matrix('user_item', user_item_interaction_matrix_path)
96
+ except FileNotFoundError:
97
+ self.build_user_item_interaction_matrix()
98
+ self.save_matrix('user_item', user_item_interaction_matrix_path)
99
+
100
+ # Load precomputed item CLIP features
101
+ with open(f"{FASHION_DATA_ROOT}/meta/clip_features.pkl", "rb") as f:
102
+ print("Loading Fashion Features...")
103
+ self.clip_features = pickle.load(f)
104
+ print("Loading Fashion Features Successfully")
105
+
106
+ # Prepare embeddings and item IDs
107
+ self.item_ids = list(self.clip_features.keys())
108
+ self.image_embeddings = np.array([self.clip_features[item_id]["image_embeds"] for item_id in self.item_ids])  # use self.item_ids; a bare item_ids is undefined here
109
+
110
+ def save_matrix(self, matrix_type, filepath):
111
+ """
112
+ Save an interaction matrix to disk.
114
+
115
+ Args:
116
+ - matrix_type: 'item' or 'user_item', which matrix to save
117
+ - filepath: destination path (e.g. 'temp/item_matrix.npz')
117
+ """
118
+ if matrix_type == 'item':
119
+ matrix = self.item_interaction_matrix
120
+ elif matrix_type == 'user_item':
121
+ matrix = self.user_item_interaction_matrix
122
+ else:
123
+ raise ValueError("matrix_type must be 'item' or 'user_item'")
124
+
125
+ if matrix is None:
126
+ raise ValueError(f"{matrix_type} matrix has not been built yet.")
127
+
128
+ sparse.save_npz(filepath, matrix)
129
+ print(f"Saved {matrix_type} matrix to {filepath}")
130
+
131
+ def load_matrix(self, matrix_type, filepath):
132
+ """
133
+ Load an interaction matrix from disk.
135
+
136
+ Args:
137
+ - matrix_type: 'item' or 'user_item', which matrix to load
138
+ - filepath: source path (e.g. 'temp/item_matrix.npz')
138
+ """
139
+ if not os.path.exists(filepath):
140
+ raise FileNotFoundError(f"File {filepath} does not exist.")
141
+
142
+ matrix = sparse.load_npz(filepath)
143
+ if matrix_type == 'item':
144
+ self.item_interaction_matrix = matrix
145
+ elif matrix_type == 'user_item':
146
+ self.user_item_interaction_matrix = matrix
147
+ else:
148
+ raise ValueError("matrix_type must be 'item' or 'user_item'")
149
+
150
+ print(f"Loaded {matrix_type} matrix from {filepath}")
151
+ return matrix
152
+
153
+ def build_item_interaction_matrix(self):
154
+ """Build the item-item co-occurrence matrix from outfit compositions."""
156
+ # Initialize the item-item interaction matrix
156
+ self.item_interaction_matrix = sparse.lil_matrix((self.n_items, self.n_items), dtype=int)
157
+
158
+ for index, outfit in tqdm(self.outfits_df.iterrows(), total=len(self.outfits_df)):
159
+ item_ids = outfit['item_ids'].split(',')
160
+ # Record co-occurrence counts for each item pair
161
+ for item_id1, item_id2 in combinations(item_ids, r=2):
162
+ if item_id1 in self.item_id_to_index and item_id2 in self.item_id_to_index:
163
+ idx1 = self.item_id_to_index[item_id1]
164
+ idx2 = self.item_id_to_index[item_id2]
165
+ self.item_interaction_matrix[idx1, idx2] += 1
166
+ self.item_interaction_matrix[idx2, idx1] += 1  # keep the matrix symmetric (unordered pairs)
167
+
168
+ # Convert to CSR format for fast row access
169
+ self.item_interaction_matrix = self.item_interaction_matrix.tocsr()
170
+ return self.item_interaction_matrix
171
+
172
+ def build_user_item_interaction_matrix(self):
173
+ """Build the user-item interaction matrix from users' outfit histories."""
175
+ # Initialize the user-item interaction matrix
175
+ self.user_item_interaction_matrix = sparse.lil_matrix((self.n_users, self.n_items), dtype=int)
176
+
177
+ for uid, user in tqdm(self.users_df.iterrows(), total=len(self.users_df)):
178
+ oids = user["outfit_ids"].split(",")
179
+ outfits = self.outfits_df.loc[self.outfits_df.index.isin(oids)]
180
+ for oid, outfit in outfits.iterrows():
181
+ item_ids = outfit['item_ids'].split(',')
182
+ # Record each user-item occurrence
183
+ for iid in item_ids:
184
+ if iid in self.item_id_to_index:
185
+ uidx = self.user_id_to_index[uid]
186
+ iidx = self.item_id_to_index[iid]
187
+ self.user_item_interaction_matrix[uidx, iidx] += 1
188
+
189
+ # Convert to CSR format for fast row access
190
+ self.user_item_interaction_matrix = self.user_item_interaction_matrix.tocsr()
191
+ return self.user_item_interaction_matrix
192
+
193
+ def _process_interactions_for_category(
194
+ self,
195
+ matrix,
196
+ given_id,
197
+ category_indices,
198
+ id_to_index
199
+ ):
200
+ """
201
+ Collect the interactions between a single entity and the target category.
202
+
203
+ Args:
204
+ - matrix: interaction matrix to query
205
+ - given_id: id of the given entity (user or item)
206
+ - category_indices: set of item indices that belong to the target category
207
+
208
+ Returns:
209
+ - a list of interactions, each a dict with item_id, interaction_count and score
210
+ """
211
+ interactions = []
212
+
213
+ given_index = id_to_index[given_id]
214
+ row = matrix[given_index]
215
+
216
+ # Extract the non-zero entries of this row
217
+ row_start = row.indptr[0]
218
+ row_end = row.indptr[1]
219
+ col_indices = row.indices[row_start:row_end]
220
+ data_values = row.data[row_start:row_end]
221
+
222
+ # Keep only items that belong to the target category
223
+ for col_idx, value in zip(col_indices, data_values):
224
+ # Check whether this item is in the target category
225
+ if col_idx in category_indices:
226
+ # Look up the item id
227
+ output_id = self.index_to_item_id[col_idx]
228
+ interactions.append({
229
+ 'item_id': output_id,
230
+ 'interaction_count': int(value),
231
+ 'score': 0.0
232
+ })
233
+
234
+ return interactions
235
+
236
+ def get_item_category_interactions(
237
+ self,
238
+ target_category: str,
239
+ given_ids: List[str],
240
+ query_type='item', # item or user
241
+ top_k=None,
242
+ ):
243
+ """
244
+ Get all interactions between the given entities (users or items) and the target category.
246
+
247
+ Args:
248
+ - target_category: the subcategory to query
249
+ - given_ids: list of entity ids (item ids or user ids, depending on query_type)
250
+ - query_type: type of the given entities, 'item' or 'user'
251
+ - top_k: return only the k items with the most interactions; return everything if None
252
+
253
+ Returns:
254
+ - a list of interaction statistics with the target category, sorted by interaction count
254
+ """
255
+ if query_type == 'item':
256
+ matrix = self.item_interaction_matrix
257
+ id_to_index = self.item_id_to_index
258
+ elif query_type == 'user':
259
+ matrix = self.user_item_interaction_matrix
260
+ id_to_index = self.user_id_to_index
261
+ else:
262
+ print(f'query_type must be either item or user but got {query_type}')
263
+ return []
264
+
265
+ # Collect all interaction records
266
+ all_interactions = []
267
+ category = target_category
268
+ category_indices = self.subcategory_to_indices.get(category, set())  # all item indices in this category
269
+
270
+ # Gather this entity's interactions with the category
271
+ for given_id in given_ids:
272
+ interactions = self._process_interactions_for_category(
273
+ matrix, given_id, category_indices, id_to_index
274
+ )
275
+ # Append the interactions to the result list
276
+ all_interactions.extend(interactions)
277
+
278
+ # Merge interaction counts for duplicate items
279
+ item_interactions = {}
280
+ for interaction in all_interactions:
281
+ item_id = interaction['item_id']
282
+ count = interaction['interaction_count']
283
+
284
+ if item_id in item_interactions:
285
+ item_interactions[item_id] += count
286
+ else:
287
+ item_interactions[item_id] = count
288
+
289
+ # Convert to the output format
290
+ merged_interactions = [
291
+ {'item_id': item_id, 'interaction_count': count, 'score': 0.0}
292
+ for item_id, count in item_interactions.items()
293
+ ]
294
+
295
+ # Sort by interaction count (descending)
296
+ if merged_interactions:
297
+ merged_interactions.sort(key=lambda x: x['interaction_count'], reverse=True)
298
+
299
+ # Truncate to the top-k items
300
+ if top_k and merged_interactions:
301
+ merged_interactions = merged_interactions[:top_k]
302
+
303
+ # Return the merged results
304
+ return merged_interactions
305
+
306
+ def rank_by_similarity(self, item_interactions, user_interactions, beta=2.0):
307
+ """
308
+ Score the user-interacted items by similarity to the item-interaction profile and sort them.
309
+ """
310
+
311
+ def get_combined_features(feature_dict):
312
+ return (feature_dict['image_embeds'] + feature_dict['text_embeds']) / 2
313
+
314
+ item_feature_list = []
315
+ for item in item_interactions:
316
+ item_id = item['item_id']
317
+ if item_id not in self.clip_features:
318
+ raise ValueError(f"Didn't find clip feature of item with id: {item_id}")
319
+
320
+ item_features = get_combined_features(self.clip_features[item_id])
321
+ item_feature_list.append(item_features)
322
+
323
+ weights = np.array([x['interaction_count'] for x in item_interactions], dtype=np.float32)
324
+ weights = weights / np.sum(weights)
325
+ item_feature = np.sum(np.stack(item_feature_list, axis=0) * weights[:, np.newaxis], axis=0).reshape(1, -1)
326
+
327
+ max_count = max((user_item.get('interaction_count', 1) for user_item in user_interactions), default=1)
328
+ for user_item in user_interactions:
329
+ user_item_id = user_item['item_id']
330
+ if user_item_id not in self.clip_features:
331
+ raise ValueError(f"Didn't find clip feature of item with id: {user_item_id}")
332
+
333
+ user_item_features = get_combined_features(self.clip_features[user_item_id]).reshape(1, -1)
334
+ similarity = cosine_similarity(user_item_features, item_feature).item()
335
+ interaction_count = user_item['interaction_count']
336
+ count_factor = (interaction_count / max_count) * beta + 1
337
+ user_item['score'] = float(similarity) * count_factor
338
+
339
+ user_interactions.sort(key=lambda x: x.get('score', 0), reverse=True)
340
+ return user_interactions
341
+
342
+
343
+ data_manager = InteractionDataManager(users_df, outfits_df, items_df)
344
+ mcp = FastMCP('image-retrieval-server')
345
+
346
+
347
+ @mcp.tool()
348
+ async def summary_user_history(user_id: str, target_category: str, list_of_items: List[str]) -> str:
349
+ """Summarize the user's buying history for a specific fashion category, given user_id, target_category and list_of_items.
350
+ After collecting the user's buying history, we summarize the descriptions of these historical items with an LLM
351
+ and return the user's preference for target_category in a sentence.
352
+
353
+ Args:
354
+ user_id (str): User id. Will be provided through prompt
355
+ target_category (str): We care about user's buying history of this specific category.
356
+ list_of_items: List of item ids for history filtering. Will be provided through prompt
357
+ """
358
+ # We need to find the most appropriate item to become the target item
359
+ # It should have enough relationship with user and other items
360
+ # Specifically, item_interaction larger than 3, history larger than 10
361
+ item_interaction_result = data_manager.get_item_category_interactions(
362
+ target_category, list_of_items, query_type='item'
363
+ )
364
+ user_interaction_result = data_manager.get_item_category_interactions(
365
+ target_category, [user_id], query_type='user'
366
+ )
367
+
368
+ def get_description(item_id: str) -> str:
369
+ return data_manager.items_df.loc[item_id].gen_description
370
+
371
+ descriptions_for_summary = []
372
+ if len(item_interaction_result) == 0:
373
+ descriptions_for_summary = [get_description(x['item_id']) for x in user_interaction_result]
374
+ else:
375
+ if len(user_interaction_result) > 0:  # '>= 0' was always true; only re-rank when the user has history
376
+ user_interaction_result = data_manager.rank_by_similarity(
377
+ item_interaction_result,
378
+ user_interaction_result
379
+ )
380
+ descriptions_for_summary = [get_description(x['item_id']) for x in user_interaction_result[:5]]
381
+
382
+ if descriptions_for_summary:
383
+ user_message = f"Summarize the user's preference for {target_category} based on the following descriptions of fashion items the user bought previously:"
384
+ for x in descriptions_for_summary:
385
+ user_message += f"\n{x}"
386
+ # Get summary using OpenAI API call
387
+ response = await openai.chat.completions.create(
388
+ model="gpt-4o-mini",
389
+ messages=[
390
+ {"role": "system", "content": "You are a user preference summary assistant. Your response is limited to one sentence, starting with 'I prefer ...'"},
391
+ {"role": "user", "content": user_message}
392
+ ],
393
+ max_tokens=1000,
394
+ )
395
+ return response.choices[0].message.content
396
+ else:
397
+ return ""
398
+
399
+
400
+ # Leftover smoke test, kept commented out (it looked up this user's historical interactions for the
401
+ # category given a partial outfit). Note: summary_user_history is an async MCP tool, so calling it at
402
+ # module import time like this would only create an un-awaited coroutine.
403
+ # user_id = "115"
+ # partial_outfit = ["25479e5dacebbfaed18a7dc4830bd5cd19114486", "becc7b46236e9abb6f6760e7a1569b06bbc236c1",
404
+ #                   "180c32b5c8c164f3c632f3e73d6002ccfa6fea57"]
405
+ # target_category = "Skirts"
+ # summary_user_history(user_id, target_category, partial_outfit)
406
+
407
+
408
+ async def compute_text_embedding(text: str) -> np.ndarray:
409
+ inputs = clip_processor(text=text, return_tensors="pt", padding=True, truncation=True)
410
+ with torch.no_grad():
411
+ text_embedding = clip_model.get_text_features(**inputs).numpy()
412
+ return text_embedding / np.linalg.norm(text_embedding, axis=1, keepdims=True)
413
+
414
+
415
+ async def find_most_similar_image(text_embedding: np.ndarray) -> Dict[str, Any]:
416
+ similarities = np.dot(data_manager.image_embeddings, text_embedding.T).flatten()
417
+ most_similar_idx = np.argmax(similarities)
418
+ most_similar_item_id = data_manager.item_ids[most_similar_idx]
419
+ return {
420
+ "image_path": image_paths[most_similar_item_id],
421
+ "similarity": float(similarities[most_similar_idx])
422
+ }
423
+
424
+
425
+ @mcp.tool()
426
+ async def retrieve_image(text: str) -> Dict[str, Any]:
427
+ """Search for the most similar fashion image based on a text description.
428
+
429
+ Args:
430
+ text (str): Text description of the fashion item to search.
431
+ """
432
+ print(f"Searching for {text}")
433
+ text_embedding = await compute_text_embedding(text)
434
+ return await find_most_similar_image(text_embedding)
435
+
436
+
437
+ mcp_server = mcp._mcp_server  # access the underlying low-level Server object
438
+ sse_transport = SseServerTransport("/messages/")
439
+
440
+
441
+ async def handle_sse(request):
442
+ print("Handling SSE connection")
443
+ async with sse_transport.connect_sse(request.scope, request.receive, request._send) as streams:
444
+ read_stream, write_stream = streams
445
+ await mcp_server.run(
446
+ read_stream,
447
+ write_stream,
448
+ mcp_server.create_initialization_options(),
449
+ )
450
+
451
+ # Define the routes
452
+ routes = [
453
+ Route("/sse", endpoint=handle_sse),
454
+ Mount("/messages/", app=sse_transport.handle_post_message),
455
+ ]
456
+
457
+ # Create the Starlette application
458
+ starlette_app = Starlette(routes=routes)
459
+
460
+
461
+ if __name__ == "__main__":
462
+ print("Starting Image Retrieval server with HTTP and SSE...")
463
+ uvicorn.run(starlette_app, host="0.0.0.0", port=8001)  # port 8001 avoids clashing with the FashionVLM service
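For reference, a rough usage sketch of the interaction lookup defined above (not part of the server). It assumes the FashionRec parquet/pickle files under FASHION_DATA_ROOT are available, that importing the module (which loads CLIP and the cached matrices) is acceptable, and that user id "115" and the "Skirts" subcategory are just placeholders.

from mcp_servers.product_user_database import data_manager

# Skirts that user "115" has interacted with, most frequent first
hits = data_manager.get_item_category_interactions("Skirts", ["115"], query_type="user", top_k=5)
for hit in hits:
    print(hit["item_id"], hit["interaction_count"])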
mcp_servers/virtual_try_on.py ADDED
@@ -0,0 +1,94 @@
1
+ import os
2
+ import pathlib
3
+ import uuid
4
+
5
+ from dotenv import load_dotenv
6
+ import PIL
7
+ from google import genai
8
+ from google.genai import types
9
+ from mcp.server.fastmcp import FastMCP
10
+
11
+
12
+ load_dotenv()
13
+
14
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
15
+ GEN_IMG_DIR = os.getenv("GEN_IMG_DIR")
16
+ # os.environ["HTTP_PROXY"] = "http://127.0.0.1:10809"
17
+ # os.environ["HTTPS_PROXY"] = "http://127.0.0.1:10809"
18
+
19
+ client = genai.Client(api_key=GEMINI_API_KEY)
20
+ mcp = FastMCP("virtual_try_on")
21
+
22
+
23
+ async def save_image(response, path):
24
+ for part in response.candidates[0].content.parts:
25
+ if part.text is not None:
26
+ continue
27
+ elif part.inline_data is not None:
28
+ mime = part.inline_data.mime_type
29
+ data = part.inline_data.data
30
+ pathlib.Path(path).write_bytes(data)
31
+
32
+
33
+ @mcp.tool()
34
+ async def try_on(image_path: str) -> str:
35
+ """Generate a virtual try-on image based on image path and return the saved file path.
36
+
37
+ Args:
38
+ image_path (str): Path to the input image file for try-on generation
39
+
40
+ Returns:
41
+ str: File path of the generated image
42
+ """
43
+ try:
44
+ print(image_path)
45
+ response = client.models.generate_content(
46
+ model="models/gemini-2.0-flash-exp",
47
+ contents=[
48
+ "You are a virtual try on tool. Put all the clothes uploaded on a real person and create a picture. Only clothings should be put on, excluding shoes or bags or accessories.",
49
+ PIL.Image.open(os.path.abspath(image_path))
50
+ ],
51
+ config=types.GenerateContentConfig(response_modalities=['Text', 'Image'])
52
+ )
53
+ gen_img_filename = f'{GEN_IMG_DIR}/{uuid.uuid4().hex}.png'
54
+ await save_image(response, gen_img_filename)
55
+ return os.path.abspath(gen_img_filename)
56
+ except Exception as e:
57
+ print(e)
58
+ return image_path
59
+
60
+
61
+ def main():
62
+ print("Started MCP server 'virtual_try_on'...")
63
+ mcp.run(transport='stdio')
64
+
65
+
66
+ if __name__ == "__main__":
67
+ # image_path = "/mnt/d/PostDoc/fifth paper/code/FashionVLM/datasets/FashionRec/data/basic_recommendation/train/temp/0000000_target.jpg"
68
+ # print(image_path)
69
+ #
70
+ # def save_image(response, path):
71
+ # for part in response.candidates[0].content.parts:
72
+ # if part.inline_data is not None:
73
+ # data = part.inline_data.data
74
+ # pathlib.Path(path).write_bytes(data)
75
+ #
76
+ #
77
+ # client = genai.Client(api_key="<GEMINI_API_KEY>")  # hard-coded key redacted; load it from the environment instead
78
+ #
79
+ # response = client.models.generate_content(
80
+ # model="models/gemini-2.0-flash-exp",
81
+ # contents=[
82
+ # "You are a virtual try-on tool. Put this whole outfit on a model and output one full-body image.",
83
+ # PIL.Image.open(image_path)
84
+ # ],
85
+ # config=types.GenerateContentConfig(response_modalities=['Text', 'Image'])
86
+ # )
87
+ #
88
+ # for part in response.candidates[0].content.parts:
89
+ # if part.text is not None:
90
+ # print(part.text)
91
+ #
92
+ # save_image(response, 'edited_image3.png')
93
+
94
+ main()
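As a quick local check (a sketch, not part of the server), the try_on coroutine above could be driven directly with asyncio. This assumes the FastMCP tool decorator returns the original coroutine function, that GEMINI_API_KEY and GEN_IMG_DIR are set, and that the input path below (hypothetical) points to a real image.

import asyncio
from mcp_servers.virtual_try_on import try_on

# Returns the absolute path of the generated image, or the input path if generation failed
result_path = asyncio.run(try_on("generated_images/sample_outfit_grid.png"))
print(result_path)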
requirements.txt ADDED
@@ -0,0 +1,228 @@
1
+ accelerate==0.21.0
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ albumentations==0.3.2
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anykeystore==0.2
8
+ asn1crypto==1.5.1
9
+ asttokens==2.4.1
10
+ async-timeout==4.0.3
11
+ attrs==21.2.0
12
+ bidict==0.23.1
13
+ blessed==1.20.0
14
+ boto3==1.34.113
15
+ botocore==1.34.113
16
+ braceexpand==0.1.7
17
+ cachetools==5.3.3
18
+ certifi==2024.2.2
19
+ cffi==1.16.0
20
+ chardet==5.2.0
21
+ charset-normalizer==3.3.2
22
+ click==8.1.7
23
+ clip==0.2.0
24
+ clip-openai==1.0.post20230121
25
+ cmake==3.29.3
26
+ cramjam==2.8.3
27
+ crcmod==1.7
28
+ cryptacular==1.6.2
29
+ cryptography==39.0.2
30
+ cycler==0.12.1
31
+ datasets==2.2.1
32
+ diffusers==0.30.1
33
+ decorator==5.1.1
34
+ decord==0.6.0
35
+ deepspeed==0.14.2
36
+ defusedxml==0.7.1
37
+ Deprecated==1.2.14
38
+ descartes==1.1.0
39
+ dill==0.3.8
40
+ distlib==0.3.8
41
+ distro-info==1.0
42
+ dnspython==2.6.1
43
+ docker-pycreds==0.4.0
44
+ docstring_parser==0.16
45
+ ecdsa==0.19.0
46
+ einops==0.6.0
47
+ exceptiongroup==1.2.1
48
+ executing==2.0.1
49
+ fairscale==0.4.13
50
+ fastparquet==2024.5.0
51
+ ffmpegcv==0.3.13
52
+ filelock==3.14.0
53
+ fire==0.6.0
54
+ fonttools==4.51.0
55
+ frozenlist==1.4.1
56
+ fsspec==2023.6.0
57
+ ftfy==6.2.0
58
+ gitdb==4.0.11
59
+ GitPython==3.1.43
60
+ gpustat==1.1.1
61
+ greenlet==3.0.3
62
+ grpcio==1.64.0
63
+ h11==0.14.0
64
+ hjson==3.1.0
65
+ huggingface-hub==0.23.2
66
+ hupper==1.12.1
67
+ idna==3.7
68
+ imageio==2.34.1
69
+ imgaug==0.2.6
70
+ iniconfig==2.0.0
71
+ ipaddress==1.0.23
72
+ ipdb==0.13.13
73
+ ipython==8.18.1
74
+ jaxtyping==0.2.28
75
+ jedi==0.19.1
76
+ Jinja2==3.1.4
77
+ jmespath==1.0.1
78
+ joblib==1.4.2
79
+ jsonargparse==4.14.1
80
+ jsonlines==4.0.0
81
+ kiwisolver==1.4.5
82
+ kornia==0.7.2
83
+ kornia_rs==0.1.3
84
+ lazy_loader==0.4
85
+ lightning==2.2.3
86
+ lightning-utilities==0.11.2
87
+ lit==18.1.6
88
+ MarkupSafe==2.1.5
89
+ matplotlib==3.5.3
90
+ matplotlib-inline==0.1.7
91
+ miscreant==0.3.0
92
+ mpmath==1.3.0
93
+ msgpack==1.0.8
94
+ multidict==6.0.5
95
+ multiprocess==0.70.16
96
+ natsort==8.4.0
97
+ networkx==3.2.1
98
+ ninja==1.11.1.1
99
+ numpy==1.24.4
100
+ nuscenes-devkit==1.1.11
101
+ oauthlib==3.2.2
102
+ omegaconf==2.3.0
103
+ open-clip-torch==2.24.0
104
+ openai-clip
105
+ opencv-python==4.9.0.80
106
+ opencv-python-headless==3.4.18.65
107
+ packaging==22.0
108
+ pandas==1.5.3
109
+ parquet==1.3.1
110
+ parso==0.8.4
111
+ PasteDeploy==3.1.0
112
+ pathlib2==2.3.7.post1
113
+ pathtools==0.1.2
114
+ pbkdf2==1.3
115
+ pexpect==4.9.0
116
+ pillow==10.3.0
117
+ plaster==1.1.2
118
+ plaster-pastedeploy==1.0.1
119
+ platformdirs==4.2.2
120
+ plotly==5.22.0
121
+ pluggy==1.5.0
122
+ ply==3.11
123
+ promise==2.3
124
+ prompt-toolkit==3.0.43
125
+ protobuf==3.20.3
126
+ psutil==5.9.8
127
+ ptyprocess==0.7.0
128
+ pure-eval==0.2.2
129
+ py==1.11.0
130
+ py-cpuinfo==9.0.0
131
+ py-spy==0.3.14
132
+ pyarrow==11.0.0
133
+ pyarrow-hotfix==0.6
134
+ pyasn1==0.6.0
135
+ pycocotools==2.0.7
136
+ pycparser==2.22
137
+ pycryptodomex==3.20.0
138
+ pycurl==7.43.0.6
139
+ pydantic==1.10.15
140
+ pydantic_core==2.18.3
141
+ Pygments==2.18.0
142
+ PyJWT==2.8.0
143
+ pynvml==11.5.0
144
+ pyope==0.2.2
145
+ pyOpenSSL==23.2.0
146
+ pyparsing==3.1.2
147
+ pyquaternion==0.9.9
148
+ pyramid==2.0.2
149
+ pyramid-mailer==0.15.1
150
+ pytest==6.2.5
151
+ python-consul==1.1.0
152
+ python-dateutil==2.9.0.post0
153
+ python-engineio==4.9.1
154
+ python-etcd==0.4.5
155
+ python-jose==3.3.0
156
+ python-socketio==5.11.2
157
+ python3-openid==3.2.0
158
+ pytorch-extension==0.2
159
+ pytorch-lightning==2.2.3
160
+ pytz==2024.1
161
+ PyYAML==6.0.1
162
+ regex==2024.5.15
163
+ repoze.sendmail==4.4.1
164
+ requests==2.31.0
165
+ requests-oauthlib==2.0.0
166
+ rsa==4.9
167
+ s3transfer==0.10.1
168
+ safetensors==0.4.3
169
+ schedule==1.2.2
170
+ scikit-image==0.22.0
171
+ scikit-learn==1.5.0
172
+ scipy==1.13.1
173
+ sentencepiece==0.2.0
174
+ sentry-sdk==2.3.1
175
+ setproctitle==1.3.3
176
+ Shapely==1.8.5.post1
177
+ shortuuid==1.0.13
178
+ simple-websocket==1.0.0
179
+ six==1.16.0
180
+ smmap==5.0.1
181
+ SQLAlchemy==2.0.30
182
+ stack-data==0.6.3
183
+ sympy==1.12
184
+ taming-transformers-rom1504==0.0.6
185
+ tenacity==8.3.0
186
+ tensorboardX==2.6.2.2
187
+ termcolor==2.4.0
188
+ threadpoolctl==3.5.0
189
+ thriftpy2==0.5.0
190
+ tifffile==2024.5.22
191
+ timm==1.0.3
192
+ tokenizers==0.19.1
193
+ toml==0.10.2
194
+ tomli==2.0.1
195
+ torch==2.2.1
196
+ torch-fidelity==0.3.0
197
+ torchmetrics==1.4.0.post0
198
+ torchvision==0.17.1
199
+ tox==3.28.0
200
+ tqdm==4.66.4
201
+ traitlets==5.14.3
202
+ transaction==4.0
203
+ transformers==4.41.1
204
+ translationstring==1.4
205
+ triton==2.2.0
206
+ typeguard==2.13.3
207
+ typing_extensions==4.12.0
208
+ tzdata==2024.1
209
+ urllib3==1.26.18
210
+ velruse==1.1.1
211
+ venusian==3.1.0
212
+ virtualenv==20.26.2
213
+ wandb==0.17.2
214
+ watchdog==4.0.1
215
+ wcwidth==0.2.13
216
+ webdataset==0.2.86
217
+ WebOb==1.8.7
218
+ websocket-client==1.8.0
219
+ wrapt==1.16.0
220
+ wsproto==1.2.0
221
+ WTForms==3.1.2
222
+ wtforms-recaptcha==0.3.2
223
+ xformers==0.0.25
224
+ xxhash==3.4.1
225
+ yarl==1.9.4
226
+ zope.deprecation==5.0
227
+ zope.interface==6.4.post2
228
+ zope.sqlalchemy==3.1
system_message.py ADDED
@@ -0,0 +1,9 @@
1
+ SYSTEM_MESSAGE = """
2
+ You are a fashion assistant. Use the weather tool to get weather alerts.
3
+
4
+ Use the retrieve_image tool when the user asks you to find a product in the database from a description.
5
+ Use the image_generate tool to generate a fashion item image from a description. The description must be in English!
6
+ Use the fashion_recommend_without_image tool to generate recommendations when no image paths are included.
7
+ Use the fashion_recommend tool for queries that include uploaded images; recommend according to the query and the uploaded images.
8
+ Use the try_on tool when the user asks to try on uploaded images. The clothing image paths will be provided in the prompt.
9
+ """
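For context, SYSTEM_MESSAGE is intended to seed the chat history before tool calling; a minimal sketch follows (openai_client, tool_schemas and the user text are hypothetical placeholders, not defined in this repo).

from system_message import SYSTEM_MESSAGE

messages = [
    {"role": "system", "content": SYSTEM_MESSAGE},
    {"role": "user", "content": "Find me a red summer dress from the database"},
]
# response = await openai_client.chat.completions.create(
#     model="gpt-4o-mini", messages=messages, tools=tool_schemas)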
utils.py ADDED
@@ -0,0 +1,74 @@
1
+ from PIL import Image
2
+ from typing import List
3
+
4
+
5
+ def create_image_grid(image_paths: List[str], output_path: str, grid_size: int = 2) -> None:
6
+ """
7
+ Merge up to four images into a 2x2 grid. Missing cells are left blank, transparent backgrounds are filled with white, and images are resized with their aspect ratio preserved.
8
+
9
+ Args:
10
+ - image_paths: list of input image paths
11
+ - output_path: path of the output grid image
12
+ - grid_size: grid size (default 2x2)
13
+ """
14
+ images = []
15
+ target_size = (256, 256)  # target size of each grid cell
16
+
17
+ for path in image_paths[:4]:
18
+ try:
19
+ # Load the image, preserving transparency
20
+ img = Image.open(path).convert('RGBA')
21
+
22
+ # If the image has an alpha channel, composite it over a white background
23
+ if img.mode == 'RGBA':
24
+ background = Image.new('RGBA', img.size, (255, 255, 255, 255))  # white background
25
+ img = Image.alpha_composite(background, img)
26
+
27
+ # Convert to RGB
28
+ img = img.convert('RGB')
29
+ original_width, original_height = img.size
30
+ aspect_ratio = original_width / original_height
31
+ if original_width >= original_height:
32
+ # width is the longer side: scale width to 256
33
+ new_width = 256
34
+ new_height = int(256 / aspect_ratio)
35
+ else:
36
+ # height is the longer side: scale height to 256
37
+ new_height = 256
38
+ new_width = int(256 * aspect_ratio)
39
+
40
+ # Aspect-preserving resize
41
+ img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
42
+ # img.thumbnail(target_size, Image.Resampling.LANCZOS)  # alternative high-quality downscaling
43
+
44
+ # Create a 256x256 blank canvas (white background)
45
+ canvas = Image.new('RGB', target_size, (255, 255, 255))
46
+
47
+ # Compute the centered paste offset
48
+ offset_x = (target_size[0] - img.size[0]) // 2
49
+ offset_y = (target_size[1] - img.size[1]) // 2
50
+
51
+ # Paste the resized image centered on the canvas
52
+ canvas.paste(img, (offset_x, offset_y))
53
+
54
+ images.append(canvas)
55
+ except Exception as e:
56
+ print(f"Error loading image {path}: {e}")
57
+ images.append(None)
58
+
59
+ # Pad with None if there are fewer than 4 images
60
+ while len(images) < 4:
61
+ images.append(None)
62
+
63
+ # Create the 512x512 blank grid canvas
64
+ grid_image = Image.new('RGB', (512, 512), (255, 255, 255))  # white background
65
+
66
+ # Paste the tiles in a 2x2 layout
67
+ for idx, img in enumerate(images):
68
+ if img is not None:
69
+ x = (idx % 2) * 256
70
+ y = (idx // 2) * 256
71
+ grid_image.paste(img, (x, y))
72
+
73
+ # Save the grid as JPEG
74
+ grid_image.save(output_path, quality=95)  # quality=95 to avoid heavy compression artifacts
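A minimal usage sketch of create_image_grid (the paths below are hypothetical; with fewer than four inputs the remaining cells stay white):

from utils import create_image_grid

create_image_grid(
    ["top.jpg", "skirt.png", "shoes.jpg"],
    "generated_images/outfit_grid.jpg",
)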