Commit 
							
							·
						
						2e8ba46
	
1
								Parent(s):
							
							e7c16cf
								
Upload processor
Browse files
- preprocessor_config.json +3 -0
- processor.py +43 -0
- tokenizer_config.json +3 -0
    	
        preprocessor_config.json
    CHANGED
    
    | @@ -1,4 +1,7 @@ | |
| 1 | 
             
            {
         | 
|  | |
|  | |
|  | |
| 2 | 
             
              "crop_size": {
         | 
| 3 | 
             
                "height": 224,
         | 
| 4 | 
             
                "width": 224
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
            +
              "auto_map": {
         | 
| 3 | 
            +
                "AutoProcessor": "processor.GIAProcessor"
         | 
| 4 | 
            +
              },
         | 
| 5 | 
             
              "crop_size": {
         | 
| 6 | 
             
                "height": 224,
         | 
| 7 | 
             
                "width": 224
         | 
    	
        processor.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from itertools import chain
         | 
| 2 | 
            +
            from transformers import GitProcessor
         | 
| 3 | 
            +
             | 
class GIAProcessor(GitProcessor):
    """GIT-based processor for GIA that adds text-only chunked tokenization.

    Called with text only, it tokenizes the text and regroups the token
    sequences into fixed-size blocks of ``_block_size`` tokens; called with
    both text and images, it defers to ``GitProcessor.__call__``.
    """

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)
        # Chunk length used by _group_texts when tokenizing text-only input.
        self._block_size = 1024

    def _group_texts(self, examples):
        """Concatenate each feature's sequences and split them into
        ``_block_size``-sized chunks.

        Args:
            examples: mapping of feature name -> list of sequences
                (e.g. the dict returned by the tokenizer).

        Returns:
            dict with the same keys, each value a list of equal-length chunks.
            The remainder shorter than ``_block_size`` is dropped, so a batch
            whose total length is below the block size yields empty lists.
        """
        # Concatenate all texts.
        concatenated = {key: list(chain(*examples[key])) for key in examples.keys()}
        total_length = len(concatenated[list(examples.keys())[0]])
        # We drop the small remainder; if total_length < block_size the batch
        # produces empty lists. Padding could be added here instead if the
        # model supported it.
        total_length = (total_length // self._block_size) * self._block_size
        # Split by chunks of _block_size.
        return {
            key: [seq[start: start + self._block_size]
                  for start in range(0, total_length, self._block_size)]
            for key, seq in concatenated.items()
        }

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """Tokenize `text` (chunked) and/or process `images`.

        Raises:
            ValueError: if `text` is None — the original code fell through
                with `encoding` unbound and raised UnboundLocalError here.
        """
        if text is not None and images is None:
            encoded_text = self.tokenizer(text, return_tensors=return_tensors)
            encoding = self._group_texts(encoded_text)
        elif text is not None and images is not None:
            encoding = super().__call__(text, images, return_tensors, **kwargs)
        else:
            # Bug fix: previously this path raised UnboundLocalError because
            # `encoding` was never assigned; fail explicitly instead.
            raise ValueError("You have to specify `text`; images-only input is not supported.")
        return encoding

    def batch_decode(self, *args, **kwargs):
        """Forward to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        # Fixed set of model inputs this processor produces.
        return ["input_ids", "attention_mask", "pixel_values"]
# Register the processor so `AutoProcessor.from_pretrained(..., trust_remote_code=True)`
# resolves to GIAProcessor via the "auto_map" entries added to the config files.
GIAProcessor.register_for_auto_class("AutoProcessor")
    	
        tokenizer_config.json
    CHANGED
    
    | @@ -1,4 +1,7 @@ | |
| 1 | 
             
            {
         | 
|  | |
|  | |
|  | |
| 2 | 
             
              "clean_up_tokenization_spaces": true,
         | 
| 3 | 
             
              "cls_token": "[CLS]",
         | 
| 4 | 
             
              "do_lower_case": true,
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
            +
              "auto_map": {
         | 
| 3 | 
            +
                "AutoProcessor": "processor.GIAProcessor"
         | 
| 4 | 
            +
              },
         | 
| 5 | 
             
              "clean_up_tokenization_spaces": true,
         | 
| 6 | 
             
              "cls_token": "[CLS]",
         | 
| 7 | 
             
              "do_lower_case": true,
         | 
