Commit · d942a8d
1 Parent(s): 3dcb152
First commit

- app.py +126 -0
- canvas.py +295 -0
- requirements.txt +8 -0
- src/office/0a1f277a93fed629365ac5863c20c64e_frontview_6.0.png +0 -0
- src/office/0a1f277a93fed629365ac5863c20c64e_map_6.0.png +0 -0
- src/orchard/d578264e1e51cc5b8f0e496ab381cee4_frontview_79.0.png +0 -0
- src/orchard/d578264e1e51cc5b8f0e496ab381cee4_map_79.0.png +0 -0
- src/road/3cfce98ab33a3dc8d43584d5a7039cf5_frontview_8.75.png +0 -0
- src/road/3cfce98ab33a3dc8d43584d5a7039cf5_map_8.75.png +0 -0
- src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_frontview_4.0.png +0 -0
- src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_map_4.0.png +0 -0
app.py ADDED
@@ -0,0 +1,126 @@
import re
import numpy as np
import gradio as gr
import matplotlib.pyplot as plt

from io import BytesIO
from PIL import Image
from canvas import Idefics2Pipeline


def run_canvas(front_view, map_view, prompt):
    pipeline = Idefics2Pipeline.from_pretrained(
        "maum-ai/CANVAS-S"
    )
    messages = [
        {"role": "system", "content": [{"type": "text", "text": prompt}]},
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "image"}],
        },
    ]

    print(front_view)

    images = [front_view, map_view]
    pred = pipeline([messages], [images], return_traj=False)
    pred_action = re.findall(r"<ACTION_(\d+)>", pred[0])
    pred_action = np.array(pred_action, dtype=np.int64)
    print(pred_action)
    pred_action_odom = pipeline.action_tokenizer.detokenize(pred_action).tolist()
    print(pred_action_odom)

    # Create a figure and axes
    fig, axes = plt.subplots(1, 1, figsize=(8, 6))

    # Scale factor for the arrow
    scale_factor = 0.2

    axes.plot(0, 0, marker="o", color="black", markersize=10)
    axes.invert_xaxis()

    for i, center in zip(pred_action, pred_action_odom):
        x, y, yaw = center
        axes.plot(y, x, marker="^", color="blue")
        axes.arrow(
            y,
            x,
            np.sin(yaw) * scale_factor,
            np.cos(yaw) * scale_factor,
            head_width=scale_factor * 0.3,
            head_length=scale_factor * 0.3,
            fc="k",
            ec="k",
        )
        axes.text(y, x, f"{i}", fontsize=10)
    axes.axis("equal")
    axes.grid(True)

    buf = BytesIO()
    fig.savefig(buf, format="png")
    buf.seek(0)  # Rewind the buffer to the beginning
    pil_img = Image.open(buf)

    return pil_img


examples = [
    ["src/office/0a1f277a93fed629365ac5863c20c64e_frontview_6.0.png", "src/office/0a1f277a93fed629365ac5863c20c64e_map_6.0.png", """You are an indoor food-serving robot.

You must follow these driving instructions:
1. You must avoid collisions.
2. You should prioritize reaching the final destination.
3. You should follow the Trajectory Instruction.
a. If the Trajectory Instruction cannot be followed due to any obstacles, you should deviate to bypass the obstacle.
b. You should try to evade any identifiable obstacles.
4. You should maintain a constant driving speed.
a. Indoors, you should drive at a speed of 3-4km/h.
5. You must slow down(2km/h or lower) if a human or obstacle comes within 1.5m radius.
a. You must slow down(2km/h or lower) in areas where a human could suddenly appear from a blind spot."""],
    ["src/orchard/d578264e1e51cc5b8f0e496ab381cee4_frontview_79.0.png", "src/orchard/d578264e1e51cc5b8f0e496ab381cee4_map_79.0.png", """You are an outdoor speed-sprayer robot.

You must follow these driving instructions:
1. You must avoid collisions.
2. You should prioritize reaching the final destination.
3. You should follow the Trajectory Instruction.
a. If the Trajectory Instruction cannot be followed due to any obstacles, you should deviate to bypass the obstacle.
b. You should try to evade any identifiable obstacles.
4. You should maintain a constant driving speed."""],
    ["src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_frontview_4.0.png", "src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_map_4.0.png", """You are an outdoor last mile delivery robot.

You must follow these driving instructions:
1. You must avoid collisions.
2. You should prioritize reaching the final destination.
3. You should follow the Trajectory Instruction.
a. If the Trajectory Instruction cannot be followed due to any obstacles, you should deviate to bypass the obstacle.
b. You should try to evade any identifiable obstacles.
4. You should maintain a constant driving speed.
5. You must drive on the sidewalk.
a. If you need to cross the road, you must use the crosswalk."""],
    ["src/road/3cfce98ab33a3dc8d43584d5a7039cf5_frontview_8.75.png", "src/road/3cfce98ab33a3dc8d43584d5a7039cf5_map_8.75.png", """You are an outdoor self-driving robot taxi.

You must follow these driving instructions:
1. You must avoid collisions.
2. You should prioritize reaching the final destination.
3. You should follow the Trajectory Instruction.
a. If the Trajectory Instruction cannot be followed due to any obstacles, you should deviate to bypass the obstacle.
b. You should try to evade any identifiable obstacles.
4. You should maintain a constant driving speed.
5. You must drive on the road.
a. You should drive according to the left-hand-traffic law.
6. You should slow down before entering intersections, speed bumps, and crosswalks."""],
]

demo = gr.Interface(
    fn=run_canvas,
    inputs=[
        gr.Image(label="front_view", type="pil"),
        gr.Image(label="map_view", type="pil"),
        gr.Textbox(label="prompt"),
    ],
    outputs=gr.Image(label="generated waypoint"),
    title="CANVAS Demo",
    description="This is the demo of the CANVAS-S model from CANVAS: Commonsense-Aware Navigation System for Intuitive Human-Robot Interaction",
    examples=examples,
)

demo.launch()
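The Gradio UI is only a thin wrapper around the pipeline defined in canvas.py. The following is a minimal sketch (not part of the commit) of querying the pipeline directly to get raw waypoints instead of a plot; it assumes the bundled src/ example images are present and the maum-ai/CANVAS-S checkpoint can be downloaded, and it shortens the system prompt for brevity (app.py's examples hold the full driving instructions).

# Illustrative sketch: call Idefics2Pipeline directly, mirroring what run_canvas does.
from PIL import Image

from canvas import Idefics2Pipeline

pipeline = Idefics2Pipeline.from_pretrained("maum-ai/CANVAS-S")

prompt = "You are an indoor food-serving robot."  # shortened stand-in for the full instruction prompt
messages = [
    {"role": "system", "content": [{"type": "text", "text": prompt}]},
    {"role": "user", "content": [{"type": "image"}, {"type": "image"}]},
]
front_view = Image.open("src/office/0a1f277a93fed629365ac5863c20c64e_frontview_6.0.png")
map_view = Image.open("src/office/0a1f277a93fed629365ac5863c20c64e_map_6.0.png")

# With return_traj=True the pipeline parses the generated <ACTION_i> tokens and
# detokenizes them with the KMeans action tokenizer into [x, y, yaw] deltas.
waypoints = pipeline([messages], [[front_view, map_view]], return_traj=True)
print(waypoints)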
canvas.py ADDED
@@ -0,0 +1,295 @@
from pydantic import BaseModel
import numpy as np
import torch
import yaml  # needed by the YAML helpers in BaseModelYamlJsonMixin
from PIL import Image
from transformers import Idefics2ForConditionalGeneration, Idefics2Processor, PreTrainedModel, ProcessorMixin
from typing import Optional
import re

import logging
import pickle
from pathlib import Path

from matplotlib import pyplot as plt
from sklearn.cluster import KMeans


class BaseModelYamlJsonMixin:
    """
    BaseModel with helper methods for loading and saving in YAML/JSON format.
    """

    @classmethod
    def from_yaml(cls, path: Path):
        with open(path, "r", encoding="utf-8") as f:
            return cls(**yaml.safe_load(f))

    def to_yaml(self: BaseModel, path: Path):
        with open(path, "w", encoding="utf-8") as f:
            yaml.safe_dump(self.model_dump(), f)

    @classmethod
    def from_json(cls, path: Path):
        with open(path, "r", encoding="utf-8") as f:
            return cls.model_validate_json(f.read())

    def to_json(self: BaseModel, path: Path, indent: int = 4, *args, **kwargs):
        with open(path, "w", encoding="utf-8") as f:
            f.write(self.model_dump_json(indent=indent, *args, **kwargs))


class BaseModelWithYamlJsonFromTo(BaseModel, BaseModelYamlJsonMixin):
    pass


class Idefics2TrainAdditionalConfig(BaseModel):
    """
    num_action_tokens (`int`, defaults to `32`):
        Number of action tokens to add to the tokenizer vocabulary.
    do_image_splitting (`bool`, *optional*, defaults to `False`):
        Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image. That
        strategy was first introduced in https://arxiv.org/abs/2311.06607.
    lora_config (`dict`, defaults to recommended config from https://x.com/danielhanchen/status/1791900967472140583):
        Configuration for the LoRA model. If it is `None`, the model will not use LoRA.
    """

    # must be set to extend vocabulary of model + tokenizer
    num_action_tokens: int = -1  # it will be overwritten by the processor_config.yml
    # must be set to be used in pipeline
    num_actions: int = -1  # it will be overwritten by the processor_config.yml

    do_image_splitting: bool = True
    freeze_original_vocab: bool = False
    freeze_vision_model: bool = False
    freeze_connector: bool = False
    torch_dtype: str = "bfloat16"
    lora_config: dict | None = dict(
        r=256,
        lora_alpha=512,
        lora_dropout=0.1,
        target_modules="all-linear",
        use_rslora=True,
        init_lora_weights="gaussian",
        modules_to_save=["lm_head", "embed_tokens"],
    )
    model_name_or_path: str = "HuggingFaceM4/idefics2-8b"


class KMeansActionTokenizer:
    def __init__(self, action_count: int = 128):
        self.action_count = action_count
        self.kmeans = KMeans(n_clusters=self.action_count, random_state=np.random.RandomState(seed=42))

    @property
    def token_count(self):
        return self.action_count

    @classmethod
    def from_pretrained(cls, model_path: str | Path):
        model_path = Path(model_path)
        self = cls()
        with open(model_path / "tokenizer.pkl", "rb") as file:
            self.kmeans = pickle.load(file)
        self.action_count = self.kmeans.n_clusters
        # assert self.action_count == 32
        return self

    def save_pretrained(self, model_path: str | Path):
        model_path = Path(model_path)
        model_path.mkdir(exist_ok=True)
        with open(model_path / "tokenizer.pkl", "wb") as file:
            pickle.dump(self.kmeans, file)

    def train(self, actions):
        self.kmeans.fit(actions)

    def tokenize(self, action, padding=False, max_length=-1, truncation=False):
        # action: (K, 3) shape, adjusted delta_position and delta_yaw
        return [i for i in self.kmeans.predict(action)]

    def detokenize(self, tokens):
        # Token check: every token must be a valid cluster index.
        check = np.asarray(tokens)
        in_valid_range = (0 <= check) & (check < self.action_count)
        if not in_valid_range.all():
            logging.warning(f"Invalid tokens occur: {tokens}")
            # If invalid tokens occur, return the stop action.
            return np.asarray([[0.0, 0.0, 0.0] for _ in range(len(tokens))])
        return np.asarray([self.kmeans.cluster_centers_[t] for t in tokens])

    def visualize(self, figset=None):
        if figset is None:
            fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 16), dpi=300)
        else:
            fig, axes = figset
        FONT = {"fontsize": 20}

        axes[0].set_title("Center", fontdict=FONT)
        axes[1].set_title("Center_Rot", fontdict=FONT)

        labels = self.kmeans.labels_
        centers = self.kmeans.cluster_centers_

        # Plot each cluster center. A center is (x, y, yaw): plot the point (x, y)
        # and an arrow from (x, y) in the direction of yaw, scaled to the plot.
        scale_factor = 0.05
        for i, center in enumerate(centers):
            x, y, yaw = center
            axes[0].plot(x, y, "ro")
            axes[0].arrow(
                x,
                y,
                np.cos(yaw) * scale_factor,
                np.sin(yaw) * scale_factor,
                head_width=scale_factor * 0.3,
                head_length=scale_factor * 0.3,
                fc="k",
                ec="k",
            )
            axes[0].text(x, y, f"{i}", fontsize=10)
        axes[0].axis("equal")
        axes[0].grid(True)

        # Filter centers whose (x, y) lies within 0.05 of the origin (near-stop actions).
        _centers = centers[np.linalg.norm(centers[:, :2], axis=1) < 0.05]
        # print(f"action near zero: {_centers}")
        scale_factor = 0.1
        for center in _centers:
            x, y, yaw = center
            axes[1].plot(x, y, "ro")
            axes[1].arrow(
                x,
                y,
                np.cos(yaw) * scale_factor,
                np.sin(yaw) * scale_factor,
                head_width=scale_factor * 0.3,
                head_length=scale_factor * 0.3,
                fc="k",
                ec="k",
            )
        axes[1].axis("equal")
        axes[1].grid(True)

        return fig, axes


class Idefics2PipelineConfig(BaseModelWithYamlJsonFromTo):
    pipeline_class: str = "Idefics2Pipeline"
    train_additional_cfg: Idefics2TrainAdditionalConfig


class Idefics2Pipeline:
    def __init__(
        self,
        model: PreTrainedModel,
        processor: ProcessorMixin,
        action_tokenizer: KMeansActionTokenizer,
        config: Idefics2PipelineConfig,
    ):
        self.model = model
        self.processor = processor
        self.action_tokenizer = action_tokenizer
        self.config = config

    def save_pretrained(
        self,
        save_directory: str,
    ):
        if not isinstance(save_directory, Path):
            save_directory = Path(save_directory)
        self.model.save_pretrained(save_directory)
        self.processor.save_pretrained(save_directory)
        self.action_tokenizer.save_pretrained(save_directory)
        self.config.to_json(f"{save_directory}/pipeline_config.json")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str):
        if not isinstance(pretrained_model_name_or_path, Path):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)

        config = Idefics2PipelineConfig.model_validate_json(
            (pretrained_model_name_or_path / "pipeline_config.json").read_text()
        )
        model = Idefics2ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
        processor = Idefics2Processor.from_pretrained(pretrained_model_name_or_path)
        model.eval()
        action_tokenizer = KMeansActionTokenizer.from_pretrained(pretrained_model_name_or_path)
        return cls(model, processor, action_tokenizer, config)

    def to(self, device):
        return self.model.to(device)

    @torch.no_grad()
    def __call__(
        self,
        examples: list[dict],
        return_traj: Optional[bool] = False,
    ):
        """
        Call the model with examples.

        Args:
            examples: list of example, [B, example]
            return_traj: return trajectory if True
        """
        # NOTE: this overload is unused; the second __call__ definition below
        # (taking message_list and images_list) replaces it at class-creation time.
        raise NotImplementedError("Not implemented yet")

        # same as idefics2 data collator
        texts = []
        images = []
        for example in examples:
            image = example["images"]
            messages = example["messages"]
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append(image)
        inputs = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        generate_ids = self.model.generate(**inputs, max_new_tokens=self.config.num_actions)
        generated_text = self.processor.batch_decode(generate_ids, skip_special_tokens=True)

        if return_traj:
            return self.action_tokenizer.detokenize(generated_text)
        else:
            return generated_text

    @torch.no_grad()
    def __call__(
        self,
        message_list: list[list[dict]],
        images_list: list[list[Image.Image]],
        return_traj: Optional[bool] = False,
    ):
        """
        Call the model with messages and images.

        Args:
            message_list: list of messages, [B, messages]
            images_list: list of images, [B, images]
            return_traj: return trajectory if True
        """

        # we don't use batch inference for run model worker
        if len(message_list) != 1:
            raise ValueError("No batch api call allowed for Idefics2Pipeline")

        message = message_list[0]
        images = images_list[0]
        prompt = self.processor.apply_chat_template(message, add_generation_prompt=True)
        prompt = prompt.replace("<end_of_utterance>", "")
        # add space to match the training data
        prompt = prompt + " "
        inputs = self.processor(text=prompt, images=images, return_tensors="pt", padding=True)

        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        generate_ids = self.model.generate(
            **inputs, max_new_tokens=self.config.train_additional_cfg.num_actions, top_k=1
        )
        generated_texts = self.processor.batch_decode(generate_ids, skip_special_tokens=True)
        if return_traj:
            pred_action = re.findall(r"<ACTION_(\d+)>", generated_texts[0])
            # pred_action = pred_action if len(pred_action) == self.config.num_actions else [-1] * self.config.num_actions
            pred_action = np.array(pred_action, dtype=np.int64)
            return self.action_tokenizer.detokenize(pred_action).tolist()
        else:
            return generated_texts
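KMeansActionTokenizer is the piece that turns continuous odometry into the discrete <ACTION_i> vocabulary the model generates. Below is an illustrative round-trip sketch (not part of the commit) using synthetic odometry, just to show how tokenize/detokenize behave; the data and shapes are assumptions for the example only.

# Illustrative sketch of the action-token round trip with synthetic odometry.
import numpy as np

from canvas import KMeansActionTokenizer

rng = np.random.default_rng(0)
# Fake (delta_x, delta_y, delta_yaw) steps standing in for real driving odometry.
odometry = rng.normal(scale=[0.5, 0.5, 0.3], size=(1000, 3))

tokenizer = KMeansActionTokenizer(action_count=128)
tokenizer.train(odometry)                  # fit the KMeans codebook

tokens = tokenizer.tokenize(odometry[:4])  # cluster indices, i.e. the i in <ACTION_i>
recovered = tokenizer.detokenize(tokens)   # nearest cluster centers, shape (4, 3)
print(tokens, recovered)

# Out-of-range tokens fall back to the stop action [0, 0, 0] with a warning.
print(tokenizer.detokenize([999]))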
requirements.txt ADDED
@@ -0,0 +1,8 @@
transformers==4.46.1
datasets==3.1.0
pillow==10.4.0
numpy==2.1.3
torch==2.4.0
pydantic==2.9.2
scikit-learn==1.5.2
matplotlib==3.9.3
src/office/0a1f277a93fed629365ac5863c20c64e_frontview_6.0.png ADDED
src/office/0a1f277a93fed629365ac5863c20c64e_map_6.0.png ADDED
src/orchard/d578264e1e51cc5b8f0e496ab381cee4_frontview_79.0.png ADDED
src/orchard/d578264e1e51cc5b8f0e496ab381cee4_map_79.0.png ADDED
src/road/3cfce98ab33a3dc8d43584d5a7039cf5_frontview_8.75.png ADDED
src/road/3cfce98ab33a3dc8d43584d5a7039cf5_map_8.75.png ADDED
src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_frontview_4.0.png ADDED
src/sidewalk/2d0dde2a98083b7d60b24651d37532dc_map_4.0.png ADDED