Update modeling_deepseekocr.py

modeling_deepseekocr.py (+17 -11)
@@ -383,6 +383,7 @@ class DeepseekOCRModel(DeepseekV2Model):
         images_seq_mask: Optional[torch.FloatTensor] = None,
         images_spatial_crop: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
+        verbose: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:


@@ -432,10 +433,11 @@ class DeepseekOCRModel(DeepseekV2Model):
                 global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                 global_features = self.projector(global_features)

-                print('=====================')
-                print('BASE: ', global_features.shape)
-                print('PATCHES: ', local_features.shape)
-                print('=====================')
+                if verbose:
+                    print('=====================')
+                    print('BASE: ', global_features.shape)
+                    print('PATCHES: ', local_features.shape)
+                    print('=====================')

                 _, hw, n_dim = global_features.shape
                 h = w = int(hw ** 0.5)

@@ -475,10 +477,12 @@ class DeepseekOCRModel(DeepseekV2Model):
                 global_features_2 = vision_model(image_ori, global_features_1)
                 global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                 global_features = self.projector(global_features)
-                print('=====================')
-                print('BASE: ', global_features.shape)
-                print('NO PATCHES')
-                print('=====================')
+
+                if verbose:
+                    print('=====================')
+                    print('BASE: ', global_features.shape)
+                    print('NO PATCHES')
+                    print('=====================')
                 _, hw, n_dim = global_features.shape
                 h = w = int(hw ** 0.5)
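The hunks above add a `verbose` flag to `DeepseekOCRModel.forward` and gate the previously unconditional shape-debug prints behind it. A minimal sketch of that pattern, using a simplified stand-in module rather than the actual DeepSeek-OCR code:

```python
import torch
from typing import Optional

class TinyVisionStub(torch.nn.Module):
    """Stand-in for the vision/projector pipeline; only the
    verbose-gated debug printing from the diff is reproduced."""

    def forward(self, x: torch.Tensor, verbose: Optional[bool] = None):
        feats = x.flatten(2).permute(0, 2, 1)  # (B, C, H, W) -> (B, HW, C), as in the diff
        if verbose:
            print('=====================')
            print('BASE: ', feats.shape)
            print('=====================')
        return feats

m = TinyVisionStub()
m(torch.randn(1, 16, 8, 8), verbose=True)   # prints the shape banner
m(torch.randn(1, 16, 8, 8), verbose=False)  # silent: the new opt-out
```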
@@ -700,7 +704,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):



-    def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False, streamer=None):
+    def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False, streamer=None, verbose=True):
         self.disable_torch_init()

         if len(output_path) > 0 :

@@ -926,7 +930,8 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                     streamer=streamer,
                     max_new_tokens=8192,
                     no_repeat_ngram_size = 20,
-                    use_cache = True
+                    use_cache = True,
+                    verbose = verbose
                 )

             else:

@@ -943,7 +948,8 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                     eos_token_id=tokenizer.eos_token_id,
                     max_new_tokens=8192,
                     no_repeat_ngram_size = 35,
-                    use_cache = True
+                    use_cache = True,
+                    verbose = verbose
                 )
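Taken together, the change threads one `verbose` switch from the public `infer` API down to the vision forward pass: `infer` forwards `verbose = verbose` into `self.generate(...)`, and transformers passes the extra keyword through as a model kwarg to `DeepseekOCRModel.forward`, where it gates the prints. A minimal usage sketch, assuming the model is loaded with `trust_remote_code` as in the model card (model id, prompt, and paths are illustrative placeholders):

```python
from transformers import AutoModel, AutoTokenizer

model_name = 'deepseek-ai/DeepSeek-OCR'  # assumed checkpoint id
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).eval()

# verbose=True (the default) keeps the old behavior and prints the
# BASE/PATCHES banners; verbose=False suppresses the debug output.
res = model.infer(
    tokenizer,
    prompt='<image>\nFree OCR. ',   # placeholder prompt
    image_file='example.png',       # placeholder path
    output_path='out/',
    base_size=1024,
    image_size=640,
    crop_mode=True,
    verbose=False,
)
```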