Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +1 -1
config.json +3 -60
configuration_internvl_chat.py +0 -3
conversation.py +9 -2
modeling_internlm2.py +1 -0
modeling_internvl_chat.py +28 -10

README.md CHANGED Viewed

@@ -39,7 +39,7 @@ For lmdeploy v0.5.0, please configure the chat template config first. Create the
 ```json
 {
     "model_name":"internlm2",
-    "meta_instruction":"你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。",
     "stop_words":["<|im_start|>", "<|im_end|>"]
 }
 ```

 ```json
 {
     "model_name":"internlm2",
+    "meta_instruction":"我是书生·万象，英文名是InternVL，是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新，开源开放，共享共创，推动科技进步和产业发展。",
     "stop_words":["<|im_start|>", "<|im_end|>"]
 }
 ```

config.json CHANGED Viewed

@@ -1,6 +1,5 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "/nvme/shared/InternVL2-2B",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -13,7 +12,7 @@
   "dynamic_image_size": true,
   "force_image_size": 448,
   "llm_config": {
-    "_name_or_path": "./pretrained/internlm2-chat-1_8b",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
@@ -96,7 +95,7 @@
     "tie_word_embeddings": false,
     "tokenizer_class": null,
     "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
     "torchscript": false,
     "transformers_version": "4.40.1",
@@ -108,96 +107,40 @@
   "max_dynamic_patch": 12,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
-  "pad2square": false,
   "ps_version": "v2",
   "select_layer": -1,
   "template": "internlm2-chat",
   "torch_dtype": "float16",
-  "transformers_version": null,
   "use_backbone_lora": 0,
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "drop_path_rate": 0.1,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
     "norm_type": "layer_norm",
     "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 24,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": false,
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.40.1",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": true
   }

 {
   "_commit_hash": null,
   "architectures": [
     "InternVLChatModel"
   ],
   "dynamic_image_size": true,
   "force_image_size": 448,
   "llm_config": {
+    "_name_or_path": "internlm/internlm2-chat-1_8b",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
     "tie_word_embeddings": false,
     "tokenizer_class": null,
     "top_k": 50,
+    "top_p": null,
     "torch_dtype": "bfloat16",
     "torchscript": false,
     "transformers_version": "4.40.1",
   "max_dynamic_patch": 12,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
   "ps_version": "v2",
   "select_layer": -1,
   "template": "internlm2-chat",
   "torch_dtype": "float16",
   "use_backbone_lora": 0,
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
+    "drop_path_rate": 0.0,
     "dropout": 0.0,
     "hidden_act": "gelu",
     "hidden_size": 1024,
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
     "layer_norm_eps": 1e-06,
     "model_type": "intern_vit_6b",
     "norm_type": "layer_norm",
     "num_attention_heads": 16,
     "num_channels": 3,
     "num_hidden_layers": 24,
     "output_attentions": false,
     "output_hidden_states": false,
     "patch_size": 14,
     "qk_normalization": false,
     "qkv_bias": true,
     "return_dict": true,
     "torch_dtype": "bfloat16",
     "transformers_version": "4.40.1",
     "use_bfloat16": true,
     "use_flash_attn": true
   }

configuration_internvl_chat.py CHANGED Viewed

@@ -26,7 +26,6 @@ class InternVLChatConfig(PretrainedConfig):
             llm_config=None,
             use_backbone_lora=0,
             use_llm_lora=0,
-            pad2square=False,
             select_layer=-1,
             force_image_size=None,
             downsample_ratio=0.5,
@@ -56,7 +55,6 @@ class InternVLChatConfig(PretrainedConfig):
             raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
-        self.pad2square = pad2square
         self.select_layer = select_layer
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
@@ -85,7 +83,6 @@ class InternVLChatConfig(PretrainedConfig):
         output['model_type'] = self.__class__.model_type
         output['use_backbone_lora'] = self.use_backbone_lora
         output['use_llm_lora'] = self.use_llm_lora
-        output['pad2square'] = self.pad2square
         output['select_layer'] = self.select_layer
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio

             llm_config=None,
             use_backbone_lora=0,
             use_llm_lora=0,
             select_layer=-1,
             force_image_size=None,
             downsample_ratio=0.5,
             raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.select_layer = select_layer
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
         output['model_type'] = self.__class__.model_type
         output['use_backbone_lora'] = self.use_backbone_lora
         output['use_llm_lora'] = self.use_llm_lora
         output['select_layer'] = self.select_layer
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio

conversation.py CHANGED Viewed

@@ -2,7 +2,7 @@
 Conversation prompt templates.
 We kindly request that you import fastchat instead of copying this file if you wish to use it.
-If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
 """
 import dataclasses
@@ -330,10 +330,13 @@ def get_conv_template(name: str) -> Conversation:
     return conv_templates[name].copy()
 register_conv_template(
     Conversation(
         name='Hermes-2',
         system_template='<|im_start|>system\n{system_message}',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
@@ -343,7 +346,7 @@ register_conv_template(
             6,
             7,
             8,
-        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
         stop_str='<|endoftext|>',
     )
 )
@@ -353,6 +356,8 @@ register_conv_template(
     Conversation(
         name='internlm2-chat',
         system_template='<|im_start|>system\n{system_message}',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
@@ -370,6 +375,8 @@ register_conv_template(
     Conversation(
         name='phi3-chat',
         system_template='<|system|>\n{system_message}',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|user|>\n', '<|assistant|>\n'),
         sep_style=SeparatorStyle.MPT,

 Conversation prompt templates.
 We kindly request that you import fastchat instead of copying this file if you wish to use it.
+If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
 """
 import dataclasses
     return conv_templates[name].copy()
+# Note that for inference, using the Hermes-2 and internlm2-chat templates is equivalent.
 register_conv_template(
     Conversation(
         name='Hermes-2',
         system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新，开源开放，共享共创，推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
             6,
             7,
             8,
+        ],
         stop_str='<|endoftext|>',
     )
 )
     Conversation(
         name='internlm2-chat',
         system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新，开源开放，共享共创，推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
     Conversation(
         name='phi3-chat',
         system_template='<|system|>\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新，开源开放，共享共创，推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|user|>\n', '<|assistant|>\n'),
         sep_style=SeparatorStyle.MPT,

modeling_internlm2.py CHANGED Viewed

@@ -709,6 +709,7 @@ class InternLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ['InternLM2DecoderLayer']
     _skip_keys_device_placement = 'past_key_values'
     def _init_weights(self, module):
         std = self.config.initializer_range

     supports_gradient_checkpointing = True
     _no_split_modules = ['InternLM2DecoderLayer']
     _skip_keys_device_placement = 'past_key_values'
+    _supports_flash_attn_2 = True
     def _init_weights(self, module):
         std = self.config.initializer_range

modeling_internvl_chat.py CHANGED Viewed

@@ -7,6 +7,7 @@ import warnings
 from typing import Any, List, Optional, Tuple, Union
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
@@ -23,6 +24,14 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -31,6 +40,7 @@ class InternVLChatModel(PreTrainedModel):
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -183,36 +193,44 @@ class InternVLChatModel(PreTrainedModel):
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
-    def batch_chat(self, tokenizer, pixel_values, num_patches_list, questions, generation_config, history=None,
-                         return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
-                         IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):
         if history is not None or return_history:
             print('Now multi-turn chat is not supported in batch_chat.')
             raise NotImplementedError
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
-        from .conversation import get_conv_template
         queries = []
-        if verbose:
-            image_bs = pixel_values.shape[0]
-            print(f'dynamic ViT batch size: {image_bs}, num_patches_list: {num_patches_list}')
         for idx, num_patches in enumerate(num_patches_list):
-            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            question = image_token + '\n' + questions[idx]
             template = get_conv_template(self.template)
             template.append_message(template.roles[0], question)
             template.append_message(template.roles[1], None)
             query = template.get_prompt()
             queries.append(query)
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,

 from typing import Any, List, Optional, Tuple, Union
 import torch.utils.checkpoint
+import transformers
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
 logger = logging.get_logger(__name__)
def version_cmp(v1, v2, op='eq'):
    """Compare two version strings using a named comparison operator.

    Both strings are parsed with ``packaging.version.parse`` before being
    compared, so the comparison is made on parsed version objects rather than
    on the raw strings.

    Args:
        v1: Left-hand version string, e.g. ``transformers.__version__``.
        v2: Right-hand version string, e.g. ``'4.36.2'``.
        op: Name of the comparison to apply; one of ``'eq'``, ``'ne'``,
            ``'lt'``, ``'le'``, ``'gt'`` or ``'ge'``. Defaults to ``'eq'``.

    Returns:
        The result of applying ``operator.<op>`` to the two parsed versions.

    Raises:
        ValueError: If ``op`` is not one of the supported comparison names.

    Any error raised by ``packaging.version.parse`` for a malformed version
    string propagates unchanged.
    """
    import operator

    from packaging import version

    # Only rich comparisons are meaningful for versions. Looking up an
    # arbitrary attribute of ``operator`` (e.g. 'add' or 'itemgetter') would
    # otherwise fail later with an obscure error or return a non-boolean.
    supported_ops = ('eq', 'ne', 'lt', 'le', 'gt', 'ge')
    if op not in supported_ops:
        raise ValueError(
            f'Unsupported comparison operator {op!r}; expected one of {supported_ops}.'
        )
    op_func = getattr(operator, op)
    return op_func(version.parse(v1), version.parse(v2))
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
+        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
         if history is not None or return_history:
             print('Now multi-turn chat is not supported in batch_chat.')
             raise NotImplementedError
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
         queries = []
         for idx, num_patches in enumerate(num_patches_list):
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
             template = get_conv_template(self.template)
             template.append_message(template.roles[0], question)
             template.append_message(template.roles[1], None)
             query = template.get_prompt()
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
             queries.append(query)
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,