Upload files with `vila-upload`.
Browse files
Upload mm_utils.py
Upload siglip_encoder.py
- mm_utils.py +1 -1
- siglip_encoder.py +6 -2
    	
        mm_utils.py
    CHANGED
    
    | @@ -26,7 +26,7 @@ import torch | |
| 26 | 
             
            from PIL import Image
         | 
| 27 | 
             
            from transformers import StoppingCriteria
         | 
| 28 |  | 
| 29 | 
            -
            from  | 
| 30 |  | 
| 31 |  | 
| 32 | 
             
            def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
         | 
|  | |
| 26 | 
             
            from PIL import Image
         | 
| 27 | 
             
            from transformers import StoppingCriteria
         | 
| 28 |  | 
| 29 | 
            +
            from .constants import DEFAULT_IMAGE_TOKEN
         | 
| 30 |  | 
| 31 |  | 
| 32 | 
             
            def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
         | 
    	
        siglip_encoder.py
    CHANGED
    
    | @@ -19,12 +19,16 @@ import torch.nn as nn | |
| 19 | 
             
            import torch.nn.functional as F
         | 
| 20 | 
             
            from accelerate.hooks import add_hook_to_module
         | 
| 21 | 
             
            from einops import rearrange
         | 
| 22 | 
            -
             | 
| 23 | 
             
            from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
         | 
| 24 | 
             
            from transformers.image_processing_utils import BaseImageProcessor
         | 
| 25 | 
            -
            from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
         | 
| 26 | 
             
            from transformers.models.siglip import SiglipVisionModel
         | 
| 27 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 28 |  | 
| 29 | 
             
            class VisionTower(nn.Module):
         | 
| 30 | 
             
                def __init__(self, vision_tower, args, delay_load=False):
         | 
|  | |
| 19 | 
             
            import torch.nn.functional as F
         | 
| 20 | 
             
            from accelerate.hooks import add_hook_to_module
         | 
| 21 | 
             
            from einops import rearrange
         | 
| 22 | 
            +
             | 
| 23 | 
             
            from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
         | 
| 24 | 
             
            from transformers.image_processing_utils import BaseImageProcessor
         | 
|  | |
| 25 | 
             
            from transformers.models.siglip import SiglipVisionModel
         | 
| 26 |  | 
| 27 | 
            +
            from s2wrapper import forward as multiscale_forward
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            # from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
         | 
| 30 | 
            +
            def is_deepspeed_zero3_enabled():
         | 
| 31 | 
            +
                return False
         | 
| 32 |  | 
| 33 | 
             
            class VisionTower(nn.Module):
         | 
| 34 | 
             
                def __init__(self, vision_tower, args, delay_load=False):
         | 

