Upload processing_eo1.py with huggingface_hub

processing_eo1.py (+59, -49)
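In brief: the update replaces the hasattr/ternary special-token lookups with getattr defaults, moves the per-dataset select_* key tables and the normalizers under robot_config, splits the old select_action into _prepare_robot_inputs and _process_robot_outputs helpers, and swaps model.generate for model.sample_actions in the inference path.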
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from typing import Union
 
 import numpy as np
 import torch
 from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.constants import OBS_STATE
 from lerobot.datasets.utils import cast_stats_to_numpy
 from lerobot.policies.normalize import Normalize, Unnormalize
 from transformers.feature_extraction_utils import BatchFeature
@@ -32,6 +34,8 @@ from transformers.processing_utils import (
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.video_utils import VideoInput
 
+os.environ["TOKENIZERS_PARALLELISM"] = "0"
+
 """constants"""
 DEFAULT_IMAGE_TOKEN = "<|image_pad|>"
 DEFAULT_VIDEO_TOKEN = "<|video_pad|>"
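Setting TOKENIZERS_PARALLELISM at import time, before any tokenizer is constructed, disables the Rust tokenizers backend's internal parallelism, presumably to avoid its fork-safety warning when this processor runs inside forked DataLoader workers.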
@@ -48,8 +52,8 @@ DEFAULT_STATE_TOKEN = "<|state_pad|>"
 STATE_END_TOKEN = "<|state_end|>"
 TASK_VLA_TOKEN = "<|vla|>"
 
+
 RobotInput = Union[np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]]
-RobotIDInput = Union[str, list[str]]
 
 
 class EO1VisionVideosProcessorKwargs(VideosKwargs, total=False):
@@ -99,22 +103,14 @@ class EO1VisionProcessor(ProcessorMixin):
         robot_config=None,
         **kwargs,
     ):
-        self.image_token = (
-            DEFAULT_IMAGE_TOKEN if not hasattr(tokenizer, "image_token") else tokenizer.image_token
-        )
-        self.video_token = (
-            DEFAULT_VIDEO_TOKEN if not hasattr(tokenizer, "video_token") else tokenizer.video_token
-        )
-        self.action_token = (
-            DEFAULT_ACTION_TOKEN if not hasattr(tokenizer, "action_token") else tokenizer.action_token
-        )
-        self.state_token = (
-            DEFAULT_STATE_TOKEN if not hasattr(tokenizer, "state_token") else tokenizer.state_token
-        )
+        self.image_token = getattr(tokenizer, "image_token", DEFAULT_IMAGE_TOKEN)
+        self.video_token = getattr(tokenizer, "video_token", DEFAULT_VIDEO_TOKEN)
+        self.action_token = getattr(tokenizer, "action_token", DEFAULT_ACTION_TOKEN)
+        self.state_token = getattr(tokenizer, "state_token", DEFAULT_STATE_TOKEN)
 
         # robot policy
         self.action_token_id = tokenizer.convert_tokens_to_ids(DEFAULT_ACTION_TOKEN) or 151666
-        self.action_pass_id = tokenizer.convert_tokens_to_ids(PASS_ACTION_TOKEN) or …
+        self.action_pass_id = tokenizer.convert_tokens_to_ids(PASS_ACTION_TOKEN) or 151667
         self.robot_config = robot_config or {}
         self.set_normalization(self.robot_config)
 
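The four collapsed assignments rely on three-argument getattr, which returns the attribute when the tokenizer defines it and the default otherwise, exactly the behavior of the removed hasattr ternaries. A minimal sketch of the equivalence (the Tok class is illustrative):

    class Tok:
        video_token = "<|video_pad|>"

    tok = Tok()
    # attribute present -> attribute value wins over the default
    assert getattr(tok, "video_token", "<|fallback|>") == "<|video_pad|>"
    # attribute absent -> default is returned, no AttributeError
    assert getattr(tok, "image_token", "<|image_pad|>") == "<|image_pad|>"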
@@ -126,15 +122,14 @@ class EO1VisionProcessor(ProcessorMixin):
             robot_config.get("stats"),
             robot_config.get("state_mode"),
         )
-        if …
+        if None in [features, stats, state_mode]:
             return
         else:
             normalization_mapping = {
                 "STATE": NormalizationMode(state_mode),
                 "ACTION": NormalizationMode(state_mode),
             }
-
-        self.normalize_inputs, self.unnormalize_outputs = {}, {}
+        normalize_inputs, unnormalize_outputs = {}, {}
         for repo_id, fea in features.items():
             stat = cast_stats_to_numpy(stats[repo_id])
             fea = dataset_to_policy_features(fea)
@@ -142,12 +137,11 @@ class EO1VisionProcessor(ProcessorMixin):
             input_features = {k: v for k, v in fea.items() if v.type == FeatureType.STATE}
             output_features = {k: v for k, v in fea.items() if v.type == FeatureType.ACTION}
 
-            self.normalize_inputs[repo_id] = Normalize(input_features, normalization_mapping, stat)
-            self.unnormalize_outputs[repo_id] = Unnormalize(output_features, normalization_mapping, stat)
+            normalize_inputs[repo_id] = Normalize(input_features, normalization_mapping, stat)
+            unnormalize_outputs[repo_id] = Unnormalize(output_features, normalization_mapping, stat)
 
-        self.select_video_keys = robot_config.get("select_video_keys")
-        self.select_state_keys = robot_config.get("select_state_keys")
-        self.select_action_keys = robot_config.get("select_action_keys")
+        self.robot_config = dict(robot_config)
+        self.normalize_inputs, self.unnormalize_outputs = normalize_inputs, unnormalize_outputs
 
     def __call__(
         self,
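After this hunk, everything repo-specific lives in self.robot_config: the select_* tables are looked up per repo_id on demand instead of being hoisted onto the processor. A hypothetical shape for that dict, inferred from the keys this file reads (repo ids and values are illustrative only, not from the commit):

    robot_config = {
        "features": {"lerobot/aloha_demo": {...}},      # per-repo dataset features
        "stats": {"lerobot/aloha_demo": {...}},         # per-repo normalization stats
        "state_mode": "MEAN_STD",                       # feeds NormalizationMode(state_mode)
        "max_state_dim": 32,
        "select_video_keys": {"lerobot/aloha_demo": ["observation.images.top"]},
        "select_state_keys": {"lerobot/aloha_demo": ["observation.state"]},
        "select_action_keys": {"lerobot/aloha_demo": ["action"]},
    }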
@@ -233,7 +227,6 @@ class EO1VisionProcessor(ProcessorMixin):
             )
             text[i] = text[i].replace("<|placeholder|>", self.action_token)
 
-        # state tokens
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         if return_mm_token_type_ids:
|
|
| 244 |
|
| 245 |
# robot inputs
|
| 246 |
robot_inputs = {}
|
|
|
|
| 247 |
if states is not None:
|
| 248 |
if isinstance(states, list):
|
| 249 |
states = torch.stack(states, dim=0)
|
| 250 |
-
if states.ndim ==
|
| 251 |
states = states.unsqueeze(0)
|
| 252 |
robot_inputs.update({"states": states})
|
| 253 |
|
|
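The corrected ndim guard promotes a single state vector to a batch of one. The shape arithmetic, as a standalone sketch (the 14-dim state is an arbitrary example):

    import torch

    states = [torch.zeros(14), torch.ones(14)]   # list input
    stacked = torch.stack(states, dim=0)         # -> shape (2, 14)

    single = torch.zeros(14)                     # bare 1-D state
    if single.ndim == 1:
        single = single.unsqueeze(0)             # -> shape (1, 14)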
@@ -267,22 +261,31 @@ class EO1VisionProcessor(ProcessorMixin):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-        return names_from_processor + ["second_per_grid_ts"] + ["actions"]
+        return names_from_processor + ["second_per_grid_ts"] + ["states", "actions"]
 
     @torch.no_grad
-    def select_action(self, model, batch: dict, **kwargs):
-        …
+    def _prepare_robot_inputs(self, batch: dict):
+        """Prepare model inputs from raw robot batch"""
         batch_messages = []
         batch_states = []
         max_state_dim = self.robot_config.get("max_state_dim", 32)
 
-        …
-        …
+        state_keys = [x for x in batch.keys() if x.startswith(OBS_STATE)]
+        batch_size = len(batch[state_keys[0]])
+
+        if "repo_id" in batch:
+            repo_ids = batch.pop("repo_id")
+        else:
+            print("no repo_id found, use the first one in normalize_inputs")
+            repo_ids = list(self.normalize_inputs.keys())[0]
+        repo_ids = [repo_ids] * batch_size if isinstance(repo_ids, str) else repo_ids
+
+        for i, repo_id in enumerate(repo_ids):
             mini_batch = {k: v[i] for k, v in batch.items()}
 
             normalize_inputs = self.normalize_inputs[repo_id]
-            select_video_keys = self.select_video_keys[repo_id]
-            select_state_keys = self.select_state_keys[repo_id]
+            select_video_keys = self.robot_config["select_video_keys"][repo_id]
+            select_state_keys = self.robot_config["select_state_keys"][repo_id]
 
             for k in normalize_inputs.features:
                 if not isinstance(mini_batch[k], torch.Tensor):
@@ -296,31 +299,20 @@ class EO1VisionProcessor(ProcessorMixin):
                 "role": "user",
                 "content": [
                     *({"type": "image", "image": mini_batch[k]} for k in select_video_keys),
-                    {"type": "state", "state": …},
-                    {"type": "text", "text": f"{mini_batch['task']}{TASK_VLA_TOKEN}"},
+                    {"type": "state", "state": []},  # chat template state token
+                    {"type": "text", "text": f"{mini_batch['task']}{TASK_VLA_TOKEN}"},
                 ],
             }
         ]
         batch_messages += [messages]
+        return batch_messages, batch_states, repo_ids
 
-        inputs = self.apply_chat_template(
-            batch_messages,
-            states=batch_states,
-            add_generation_prompt=True,
-            noise_prompt=f"{ACTION_START_TOKEN}{DEFAULT_ACTION_TOKEN}{ACTION_END_TOKEN}",
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        ).to(model.device)
-
-        outputs = model.generate(**inputs, max_new_tokens=128, return_dict_in_generate=True)
-        actions = outputs.actions.cpu()
-
-        # unnormalize actions
+    def _process_robot_outputs(self, repo_ids: list[str], actions: torch.Tensor):
+        """Process model outputs back to robot format"""
         output_actions = []
-        for i, repo_id in enumerate(…):
+        for i, repo_id in enumerate(repo_ids):
             unnormalize_outputs = self.unnormalize_outputs[repo_id]
-            select_action_keys = self.select_action_keys[repo_id]
+            select_action_keys = self.robot_config["select_action_keys"][repo_id]
             features = unnormalize_outputs.features
             cum_dims = [0] + np.cumsum([features[k].shape[0] for k in select_action_keys]).tolist()
             origin_action = torch.tensor(actions[i], dtype=torch.float32)[..., : cum_dims[-1]]
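The cum_dims bookkeeping in _process_robot_outputs splits one concatenated action vector back into per-key slices before unnormalization. A self-contained sketch with made-up feature dims:

    import numpy as np
    import torch

    dims = {"action.arm": 6, "action.gripper": 1}             # hypothetical dims
    keys = list(dims)
    cum = [0] + np.cumsum([dims[k] for k in keys]).tolist()   # [0, 6, 7]

    flat = torch.arange(7.0)                                  # concatenated action
    parts = {k: flat[..., cum[i] : cum[i + 1]] for i, k in enumerate(keys)}
    # parts["action.arm"] -> 6 values, parts["action.gripper"] -> 1 value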
@@ -331,7 +323,25 @@ class EO1VisionProcessor(ProcessorMixin):
             unnorm_actions = torch.concat([unnorm_actions[k] for k in select_action_keys], -1)
             output_actions.append(unnorm_actions)
         output_actions = torch.stack(output_actions, dim=0)
+        return output_actions
+
+    @torch.no_grad
+    def select_action(self, model, batch: dict, **kwargs):
+        batch_messages, batch_states, repo_ids = self._prepare_robot_inputs(batch)
+
+        noise_prompt = f"{ACTION_START_TOKEN}{DEFAULT_ACTION_TOKEN}{ACTION_END_TOKEN}"
+        inputs = self.apply_chat_template(
+            batch_messages,
+            states=batch_states,
+            add_generation_prompt=True,
+            noise_prompt=noise_prompt,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
 
+        actions = model.sample_actions(**inputs)[0].cpu()
+        output_actions = self._process_robot_outputs(repo_ids, actions)
         return BatchFeature({"action": output_actions})
 
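End to end, the refactored path is prepare inputs, apply the chat template, sample actions, then unnormalize per repo_id. A hypothetical call site (the model object, checkpoint setup, and observation keys below are assumptions for illustration, not part of this commit):

    import torch

    # processor = EO1VisionProcessor.from_pretrained(...)   # assumed setup
    # model = ...                                           # assumed EO1 policy model
    batch = {
        "observation.state": torch.zeros(1, 14),
        "observation.images.top": torch.zeros(1, 3, 224, 224),
        "task": ["pick up the cube"],
        "repo_id": ["lerobot/aloha_demo"],
    }
    actions = processor.select_action(model, batch)["action"]  # unnormalized actions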