Yosemat
/

designvlm

@@ -35,6 +35,7 @@ from transformers import (
     StoppingCriteriaList,
     set_seed,
 )
 from transformers.generation.streamers import BaseStreamer
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import (
@@ -52,6 +53,8 @@ from .modeling_internlm2 import (
 )
 _CONFIG_FOR_DOC = "InternLMXcomposer2Config"
 image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
 video_extensions = {".mp4", ".avi", ".mkv", ".mov", ".wmv"}
@@ -103,7 +106,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         self.model = InternLM2Model(config)
         self.vocab_size = config.vocab_size
         self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.tokenizer = None
         self.hd_num = 25
         self.font = get_font()
@@ -245,12 +248,12 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         self.max_length = max_length
         prompt = ""
         if meta_instruction:
-            prompt += (
-                f"""[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
-            )
         for record in history:
-            prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
-        prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
         image_nums = len(image)
         if image_nums == 1 and prompt.find("<ImageHere>") == -1:
@@ -587,7 +590,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
             # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
@@ -676,12 +679,14 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
     ):
         prompt = ""
         if meta_instruction:
-            prompt += f"""<s>[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
         else:
             prompt += "<s>"
         for record in history:
-            prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
-        prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
         return tokenizer([prompt], return_tensors="pt")
     @torch.no_grad()
@@ -724,7 +729,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
         eos_token_id = [
             tokenizer.eos_token_id,
-            tokenizer.convert_tokens_to_ids(["[UNUSED_TOKEN_145]"])[0],
         ]
         outputs = self.generate(
             **inputs,
@@ -745,7 +750,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         else:
             outputs = outputs[0].cpu().tolist()
         response = tokenizer.decode(outputs, skip_special_tokens=True)
-        response = response.split("[UNUSED_TOKEN_145]")[0]
         history = history + [(query, response)]
         return response, history
@@ -807,8 +812,8 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
-        response = response.replace("[UNUSED_TOKEN_145]", "")
-        response = response.replace("[UNUSED_TOKEN_146]", "")
         return response
@@ -847,8 +852,8 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
-        response = response.replace("[UNUSED_TOKEN_145]", "")
-        out = response.replace("[UNUSED_TOKEN_146]", "")
         image_type = "random"
         pattern = r"""https://source\.unsplash\.com/random/(\d+)x(\d+)/\?([^'"]+)"""
         if image_type == "placeholder":
@@ -900,8 +905,8 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
-        response = response.replace("[UNUSED_TOKEN_145]", "")
-        html = response.replace("[UNUSED_TOKEN_146]", "")
         if seed != -1:
             set_random_seed(seed, set_cudnn=True)
@@ -923,8 +928,8 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
-        response = response.replace("[UNUSED_TOKEN_145]", "")
-        js = response.replace("[UNUSED_TOKEN_146]", "")
         if re.search(r"</script>", html):
             js = re.findall(r"<script>([\s\S]*?)<\/script>", js)
@@ -983,8 +988,8 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
-        response = response.replace("[UNUSED_TOKEN_145]", "")
-        out = response.replace("[UNUSED_TOKEN_146]", "")
         image_type = "random"
         pattern = r"""https://source\.unsplash\.com/random/(\d+)x(\d+)/\?([^'"]+)"""
         if image_type == "placeholder":
@@ -995,3 +1000,33 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
         with open(task.replace(" ", "_") + ".html", "w") as f:
             f.write(out)
         return out

     StoppingCriteriaList,
     set_seed,
 )
+from transformers import PreTrainedTokenizer
 from transformers.generation.streamers import BaseStreamer
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import (
 )
 _CONFIG_FOR_DOC = "InternLMXcomposer2Config"
+FROM_TOKEN_1 = "[UNUSED_TOKEN_146]"
+FROM_TOKEN_2 = "[UNUSED_TOKEN_145]"
 image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
 video_extensions = {".mp4", ".avi", ".mkv", ".mov", ".wmv"}
         self.model = InternLM2Model(config)
         self.vocab_size = config.vocab_size
         self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.tokenizer: PreTrainedTokenizer = None  # type: ignore
         self.hd_num = 25
         self.font = get_font()
         self.max_length = max_length
         prompt = ""
         if meta_instruction:
+            prompt += f"""{FROM_TOKEN_1}system\n{meta_instruction}{FROM_TOKEN_2}\n"""
         for record in history:
+            prompt += f"""{FROM_TOKEN_1}user\n{record[0]}{FROM_TOKEN_2}\n{FROM_TOKEN_1}assistant\n{record[1]}{FROM_TOKEN_2}\n"""
+        prompt += (
+            f"""{FROM_TOKEN_1}user\n{query}{FROM_TOKEN_2}\n{FROM_TOKEN_1}assistant\n"""
+        )
         image_nums = len(image)
         if image_nums == 1 and prompt.find("<ImageHere>") == -1:
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.vocab_size)
             shift_labels = shift_labels.view(-1)
             # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
     ):
         prompt = ""
         if meta_instruction:
+            prompt += f"""<s>{FROM_TOKEN_1}system\n{meta_instruction}{FROM_TOKEN_2}\n"""
         else:
             prompt += "<s>"
         for record in history:
+            prompt += f"""{FROM_TOKEN_1}user\n{record[0]}{FROM_TOKEN_2}\n{FROM_TOKEN_1}assistant\n{record[1]}{FROM_TOKEN_2}\n"""
+        prompt += (
+            f"""{FROM_TOKEN_1}user\n{query}{FROM_TOKEN_2}\n{FROM_TOKEN_1}assistant\n"""
+        )
         return tokenizer([prompt], return_tensors="pt")
     @torch.no_grad()
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
         eos_token_id = [
             tokenizer.eos_token_id,
+            tokenizer.convert_tokens_to_ids([FROM_TOKEN_2])[0],
         ]
         outputs = self.generate(
             **inputs,
         else:
             outputs = outputs[0].cpu().tolist()
         response = tokenizer.decode(outputs, skip_special_tokens=True)
+        response = response.split(FROM_TOKEN_2)[0]
         history = history + [(query, response)]
         return response, history
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
+        response = response.replace(FROM_TOKEN_2, "")
+        response = response.replace(FROM_TOKEN_1, "")
         return response
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
+        response = response.replace(FROM_TOKEN_2, "")
+        out = response.replace(FROM_TOKEN_1, "")
         image_type = "random"
         pattern = r"""https://source\.unsplash\.com/random/(\d+)x(\d+)/\?([^'"]+)"""
         if image_type == "placeholder":
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
+        response = response.replace(FROM_TOKEN_2, "")
+        html = response.replace(FROM_TOKEN_1, "")
         if seed != -1:
             set_random_seed(seed, set_cudnn=True)
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
+        response = response.replace(FROM_TOKEN_2, "")
+        js = response.replace(FROM_TOKEN_1, "")
         if re.search(r"</script>", html):
             js = re.findall(r"<script>([\s\S]*?)<\/script>", js)
         response = generate[0].tolist()
         response = self.tokenizer.decode(response, skip_special_tokens=True)  # type: ignore
         # remove eoa
+        response = response.replace(FROM_TOKEN_2, "")
+        out = response.replace(FROM_TOKEN_1, "")
         image_type = "random"
         pattern = r"""https://source\.unsplash\.com/random/(\d+)x(\d+)/\?([^'"]+)"""
         if image_type == "placeholder":
         with open(task.replace(" ", "_") + ".html", "w") as f:
             f.write(out)
         return out
+    def add_tokens(self, new_tokens: list[str]) -> None:
+        """Add tokens to the tokenizer and resize the embeddings and LM head.
+
+        The tokens are registered on ``self.tokenizer``, the input embeddings
+        of ``self.model`` are resized to ``len(self.tokenizer)``, and
+        ``self.output`` is replaced by a new bias-free linear layer with one
+        row per vocabulary entry. Rows that exist in both the old and the new
+        head keep their weights; any additional rows keep the default
+        ``nn.Linear`` initialisation.
+
+        Args:
+            new_tokens: Token strings to add to the tokenizer.
+
+        Raises:
+            ValueError: If ``self.tokenizer`` has not been set.
+            RuntimeError: If the preserved rows of the new head do not match
+                the old head after the copy.
+        """
+        if self.tokenizer is None:
+            raise ValueError(
+                "`self.tokenizer` must be set before calling `add_tokens`."
+            )
+
+        self.tokenizer.add_tokens(new_tokens)  # type: ignore
+        new_vocab_size = len(self.tokenizer)
+        self.model.resize_token_embeddings(new_vocab_size)
+        self.vocab_size = new_vocab_size
+
+        # self.output needs to be resized accordingly but without losing the
+        # weights. The new head is created directly with the old head's dtype
+        # and device so it stays wherever the old one was placed.
+        old_weight = self.output.weight
+        new_output = nn.Linear(
+            self.model.config.hidden_size,
+            new_vocab_size,
+            bias=False,
+            dtype=old_weight.dtype,
+            device=old_weight.device,
+        )
+
+        # Copy only the rows both heads share, so a tokenizer that is smaller
+        # than the current head does not trigger a shape mismatch.
+        kept_rows = min(old_weight.shape[0], new_vocab_size)
+        with torch.no_grad():
+            new_output.weight[:kept_rows].copy_(old_weight[:kept_rows])
+        new_output.weight.requires_grad_(old_weight.requires_grad)
+
+        # Compare the weights directly: a forward pass on an all-zero input
+        # yields zeros for any weights (there is no bias), so it cannot detect
+        # a bad copy. An explicit raise also survives `python -O`.
+        if not torch.equal(
+            new_output.weight.detach()[:kept_rows],
+            old_weight.detach()[:kept_rows],
+        ):
+            raise RuntimeError(
+                "Resized output head does not preserve the original weights."
+            )
+
+        self.output = new_output