Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| from torchvision.io import read_video | |
| import torch.nn.functional as F | |
| import torch, hiera | |
# Class-label table; recognize() reads column 1 of the row whose position
# equals the predicted class index.
df = pd.read_csv('Kinetic400.csv')

# Video classifier built by hiera.hiera_base_16x224 with the pretrained
# "mae_k400_ft_k400" checkpoint.
model = hiera.hiera_base_16x224(pretrained=True, checkpoint="mae_k400_ft_k400")
# The model is only ever used for prediction here, so take it out of
# training mode once at start-up.
model.eval()
def recognize(vid):
    """Predict the action label of a video with the module-level Hiera model.

    Two 16-frame clips are built from the first 128 frames of the video
    (every 4th frame of frames 0-63 and of frames 64-127). Each clip is
    resized to 224x224, normalized and classified; the outputs of the two
    clips are averaged before taking the arg-max.

    Videos shorter than 128 frames are padded by repeating their last frame
    so that both clips can always be formed.

    Args:
        vid: Path of the video file; passed straight to
            ``torchvision.io.read_video``.

    Returns:
        The entry in column 1 of ``df`` at the row given by the predicted
        class index.

    Raises:
        ValueError: If ``vid`` is ``None`` or no frame could be decoded.
    """
    if vid is None:
        raise ValueError("no video was provided")

    frames, _audio, _info = read_video(vid, pts_unit='sec', output_format='THWC')
    total = frames.shape[0]
    if total == 0:
        raise ValueError("no frames could be decoded from the video")

    # Frame numbers 0, 4, ..., 124. Clamping to the last available frame pads
    # short videos instead of failing on clips of unequal length.
    picks = torch.arange(0, 128, 4).clamp(max=total - 1)

    # Sample first, then convert: only 32 frames are turned into floats
    # rather than the whole video.
    frames = frames[picks].float() / 255  # byte -> float in [0, 1]
    frames = frames.reshape(2, 16, *frames.shape[1:])  # two clips of 16 frames
    frames = frames.permute(0, 4, 1, 2, 3).contiguous()  # channels before time
    frames = F.interpolate(frames, size=(16, 224, 224), mode="trilinear")
    # frames is now torch.Size([2, 3, 16, 224, 224])

    # Per-channel normalization; the std is 0.225 for every channel.
    mean = torch.tensor([0.45, 0.45, 0.45]).view(1, -1, 1, 1, 1)
    std = torch.tensor([0.225, 0.225, 0.225]).view(1, -1, 1, 1, 1)
    frames = (frames - mean) / std

    # Prediction only: no autograd graph is needed.
    with torch.no_grad():
        scores = model(frames)

    scores = scores.mean(0)  # average the two clips
    class_index = scores.argmax(dim=-1).item()
    return df.iloc[class_index, 1]
# Web UI: one video input, the text returned by recognize() as output, and a
# single bundled example clip.
# NOTE(review): `type="file"` is passed to gr.Video as in the original;
# confirm that the installed Gradio version still accepts this keyword.
demo = gr.Interface(
    fn=recognize,
    inputs=gr.Video(type="file"),
    outputs='text',
    examples=[['dog.mp4']],
)

# Start serving the interface.
demo.launch()