KpLBaTMaN committed
Commit · 90aefec
1 Parent(s): 8e8b282

code

modeling_GOT.py CHANGED (+66 -59)
@@ -15,6 +15,7 @@ import dataclasses
 import numpy as np
 import cv2
 from io import BytesIO
+import contextlib
 ###
 
 DEFAULT_IMAGE_TOKEN = "<image>"
@@ -501,15 +502,24 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
         setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
 
-    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
-
+    def chat(
+        self,
+        tokenizer,
+        image_file,
+        ocr_type,
+        ocr_box='',
+        ocr_color='',
+        render=False,
+        save_render_file=None,
+        print_prompt=False,
+        gradio_input=False,
+        stream_flag=False,
+        device="cuda"  # new parameter to specify the device
+    ):
         self.disable_torch_init()
 
-
-        image_processor_high = GOTImageEvalProcessor(image_size=1024)
-
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
         use_im_start_end = True
-
         image_token_len = 256
 
         if gradio_input:
@@ -518,7 +528,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             image = self.load_image(image_file)
 
         w, h = image.size
-
+
         if ocr_type == 'format':
             qs = 'OCR with format: '
         else:
@@ -527,13 +537,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         if ocr_box:
            bbox = eval(ocr_box)
            if len(bbox) == 2:
-                bbox[0] = int(bbox[0]/w*1000)
-                bbox[1] = int(bbox[1]/h*1000)
+                bbox[0] = int(bbox[0] / w * 1000)
+                bbox[1] = int(bbox[1] / h * 1000)
            if len(bbox) == 4:
-                bbox[0] = int(bbox[0]/w*1000)
-                bbox[1] = int(bbox[1]/h*1000)
-                bbox[2] = int(bbox[2]/w*1000)
-                bbox[3] = int(bbox[3]/h*1000)
+                bbox[0] = int(bbox[0] / w * 1000)
+                bbox[1] = int(bbox[1] / h * 1000)
+                bbox[2] = int(bbox[2] / w * 1000)
+                bbox[3] = int(bbox[3] / h * 1000)
            if ocr_type == 'format':
                qs = str(bbox) + ' ' + 'OCR with format: '
            else:
@@ -546,15 +556,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                qs = '[' + ocr_color + ']' + ' ' + 'OCR: '
 
        if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
 
-
        conv_mpt = Conversation(
            system="""<|im_start|>system
-        You should follow the instructions carefully and explain your answers in detail.""",
-            # system = None,
+        You should follow the instructions carefully and explain your answers in detail.""",
            roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
            version="mpt",
            messages=(),
@@ -572,43 +580,47 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            print(prompt)
 
        inputs = tokenizer([prompt])
-
-        image_tensor_1 = image_processor_high(image)
-
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(device)
 
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
+        image_tensor_1 = image_processor_high(image)
+
+        # Use autocast only when on CUDA, otherwise use a null context for CPU
+        if device == "cuda":
+            autocast_context = torch.autocast("cuda", dtype=torch.bfloat16)
+        else:
+            autocast_context = contextlib.nullcontext()
+
        if stream_flag:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with autocast_context:
                output_ids = self.generate(
                    input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).half().to(device)],
                    do_sample=False,
-                    num_beams = 1,
-                    no_repeat_ngram_size = 20,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
                    streamer=streamer,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria]
-                    )
+                )
        else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with autocast_context:
                output_ids = self.generate(
                    input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).half().to(device)],
                    do_sample=False,
-                    num_beams = 1,
-                    no_repeat_ngram_size = 20,
-                    # streamer=streamer,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria]
-                    )
-
+                )
+
        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
-
+
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
            outputs = outputs.strip()
@@ -622,46 +634,44 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                import verovio
                tk = verovio.toolkit()
                tk.loadData(outputs)
-                tk.setOptions({"pageWidth": 2100, "footer": 'none',
-                               'barLineWidth': 0.5, 'beamMaxSlope': 15,
-                               'staffLineWidth': 0.2, 'spacingStaff': 6})
+                tk.setOptions({
+                    "pageWidth": 2100,
+                    "footer": 'none',
+                    'barLineWidth': 0.5,
+                    'beamMaxSlope': 15,
+                    'staffLineWidth': 0.2,
+                    'spacingStaff': 6
+                })
                tk.getPageCount()
                svg = tk.renderToSVG()
                svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
-
                svg_to_html(svg, save_render_file)
 
            if ocr_type == 'format' and '**kern' not in outputs:
-
-
-                if '\\begin{tikzpicture}' not in outputs:
+                if '\\begin{tikzpicture}' not in outputs:
                    html_path_2 = save_render_file
                    right_num = outputs.count('\\right')
-                    left_num = outputs.count('\left')
-
+                    left_num = outputs.count('\\left')
                    if right_num != left_num:
-                        outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
-
-
+                        outputs = outputs.replace('\left(', '(').replace('\\right)', ')')\
+                            .replace('\left[', '[').replace('\\right]', ']')\
+                            .replace('\left{', '{').replace('\\right}', '}')\
+                            .replace('\left|', '|').replace('\\right|', '|')\
+                            .replace('\left.', '.').replace('\\right.', '.')
                    outputs = outputs.replace('"', '``').replace('$', '')
-
                    outputs_list = outputs.split('\n')
-                    gt= ''
+                    gt = ''
                    for out in outputs_list:
-                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-
+                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
                    gt = gt[:-2]
-
-
                    lines = content_mmd_to_html
                    lines = lines.split("const text =")
-                    new_web = lines[0] + 'const text =' + gt + lines[1]
-
+                    new_web = lines[0] + 'const text =' + gt + lines[1]
                else:
                    html_path_2 = save_render_file
                    outputs = outputs.translate(translation_table)
                    outputs_list = outputs.split('\n')
-                    gt= ''
+                    gt = ''
                    for out in outputs_list:
                        if out:
                            if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
@@ -669,7 +679,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                                    out = out[:-1]
                                    if out is None:
                                        break
-
                                if out:
                                    if out[-1] != ';':
                                        gt += out[:-1] + ';\n'
@@ -677,14 +686,12 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                                        gt += out + '\n'
                            else:
                                gt += out + '\n'
-
-
                lines = tik_html
                lines = lines.split("const text =")
                new_web = lines[0] + gt + lines[1]
-
                with open(html_path_2, 'w') as web_f_new:
                    web_f_new.write(new_web)
+
        return response_str
 
    def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
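
A minimal caller-side sketch of the updated chat signature, assuming the checkpoint is loaded with trust_remote_code=True; the repo id and image path below are placeholders, not part of this commit:

# Hypothetical usage sketch; repo id and image path are illustrative only.
import torch
from transformers import AutoModel, AutoTokenizer

repo = "your-namespace/GOT-OCR2_0"  # placeholder: swap in the actual model repo
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True, low_cpu_mem_usage=True)
model = model.eval().to(device)

# chat() now accepts a device argument; on CPU the patched code wraps generate()
# in contextlib.nullcontext() instead of CUDA bfloat16 autocast.
text = model.chat(tokenizer, "sample_page.jpg", ocr_type="ocr", device=device)
print(text)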