Add video support #430

Draft: iejMac wants to merge 28 commits into main from vivit

Commits (28)
3ec0ed8
Add video support
iejMac Feb 15, 2023
0403d1d
Merge branch 'main' into vivit
iejMac Feb 16, 2023
1cd33e4
data loading: correct shapes in training loop (crappy code)
iejMac Feb 18, 2023
107292f
Merge branch 'vivit' of https://github.com/iejMac/open_clip into vivit
iejMac Feb 18, 2023
b995750
update model progress
iejMac Feb 18, 2023
be04c06
rename file + create_model loads something
iejMac Feb 19, 2023
0ad7168
update
iejMac Feb 19, 2023
f9dfd02
update
iejMac Feb 19, 2023
df1c698
embeddings get to loss, time to implement video encoding
iejMac Feb 19, 2023
7643ce3
update, set num_samples
iejMac Feb 19, 2023
cb12acd
more filling in
iejMac Feb 19, 2023
67a2d33
slightly improved preprocessing
iejMac Feb 19, 2023
f0615f9
update
iejMac Feb 19, 2023
7bd848e
update weird lag
iejMac Feb 19, 2023
80f41a0
simpler dataloader same results
iejMac Feb 20, 2023
f5af600
properly normalize frames
iejMac Feb 21, 2023
982ee88
update no temporal
iejMac Feb 21, 2023
3f4fde7
filter no mp4 samples
iejMac Feb 26, 2023
7176d95
update
iejMac Feb 26, 2023
3f64b62
adding projection removes weird const loss bug but training doesn't g…
iejMac Feb 26, 2023
8d3cc48
some updates
iejMac Feb 27, 2023
4230428
save changes
iejMac Mar 11, 2023
9413c5e
update dataloader to use video2dataset
iejMac Mar 26, 2023
5c65a52
update
iejMac Mar 27, 2023
f2fa5bd
repeat is bad
iejMac Mar 28, 2023
3125171
enable loading CLIP weights to spatial and text encoders
iejMac Apr 4, 2023
b093a4c
update
iejMac Apr 15, 2023
9db7425
update
Jun 9, 2023
77 changes: 59 additions & 18 deletions src/open_clip/factory.py
@@ -8,15 +8,17 @@
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F

from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
resize_pos_embed, get_cast_dtype
from .coca_model import CoCa
from .video_model import VideoCLIP # TODO: change once full model is implemented
from .loss import ClipLoss, DistillClipLoss, CoCaLoss
from .openai import load_openai_model
from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model, download_pretrained_from_hf
from .transform import image_transform, AugmentationCfg
from .transform import image_transform, video_transform, AugmentationCfg
from .tokenizer import HFTokenizer, tokenize


@@ -100,7 +102,18 @@ def load_checkpoint(model, checkpoint_path, strict=True):
if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
state_dict = convert_to_custom_text_state_dict(state_dict)
resize_pos_embed(state_dict, model)
incompatible_keys = model.load_state_dict(state_dict, strict=strict)

incompatible_keys = []
# TODO: find better way of doing this
if isinstance(model, VideoCLIP):
text_state_dict = {k[len("text."):]: v for k, v in state_dict.items() if k.startswith("text.")}
visual_state_dict = {k[len("visual."):]: v for k, v in state_dict.items() if k.startswith("visual.")}

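# NOTE: Module.load_state_dict returns an _IncompatibleKeys namedtuple of
# (missing_keys, unexpected_keys), so `+=` extends the list with those two
# sublists rather than collecting one result object per call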
incompatible_keys += model.text.load_state_dict(text_state_dict, strict=strict)
incompatible_keys += model.visual.spatial.load_state_dict(visual_state_dict, strict=strict)
else:
incompatible_keys = model.load_state_dict(state_dict, strict=strict)

return incompatible_keys


@@ -191,11 +204,20 @@ def create_model(
else:
model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
else:
model = CLIP(**model_cfg, cast_dtype=cast_dtype)
if "ViViT" in model_name: # TODO better way of detecting video configs
model = VideoCLIP(**model_cfg)
else:
model = CLIP(**model_cfg, cast_dtype=cast_dtype)

pretrained_loaded = False
if pretrained:
checkpoint_path = ''

# TODO: not sure how to initialize components nicely
# idea for now: model_name:pretrained
if ":" in pretrained:
model_name, pretrained = pretrained.split(":")

pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
if pretrained_cfg:
checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
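With the colon convention sketched above, initializing the spatial and text towers from an existing CLIP checkpoint would look roughly like this (the tag is illustrative, not a tested combination):

model = create_model("ViViT-B-32", pretrained="ViT-B-32:laion2b_s34b_b79k")
# splits into model_name="ViT-B-32", pretrained="laion2b_s34b_b79k" for the
# pretrained-config lookup; load_checkpoint then routes the CLIP weights into
# model.text and model.visual.spatial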
@@ -305,21 +327,40 @@ def create_model_and_transforms(
output_dict=output_dict,
)

image_mean = image_mean or getattr(model.visual, 'image_mean', None)
image_std = image_std or getattr(model.visual, 'image_std', None)
preprocess_train = image_transform(
model.visual.image_size,
is_train=True,
mean=image_mean,
std=image_std,
aug_cfg=aug_cfg,
)
preprocess_val = image_transform(
model.visual.image_size,
is_train=False,
mean=image_mean,
std=image_std,
)
# TODO: better way of getting modality specific transforms
if "ViViT" in model_name:
preprocess_train = video_transform(
frame_size=model.visual.spatial.image_size,
n_frames=model.visual.context_length,
take_every_nth=5,
is_train=False, # TODO: figure out if frame augmentations make sense
frame_mean=None,
frame_std=None,
)
preprocess_val = video_transform(
frame_size=model.visual.spatial.image_size,
n_frames=model.visual.context_length,
take_every_nth=5,
is_train=False,
frame_mean=None,
frame_std=None,
)
else:
image_mean = image_mean or getattr(model.visual, 'image_mean', None)
image_std = image_std or getattr(model.visual, 'image_std', None)
preprocess_train = image_transform(
model.visual.image_size,
is_train=True,
mean=image_mean,
std=image_std,
aug_cfg=aug_cfg,
)
preprocess_val = image_transform(
model.visual.image_size,
is_train=False,
mean=image_mean,
std=image_std,
)

return model, preprocess_train, preprocess_val
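For the video path, a minimal usage sketch (any model name containing "ViViT" takes the video branch above):

model, preprocess_train, preprocess_val = create_model_and_transforms("ViViT-B-32")
# preprocess_val is the closure returned by video_transform (see transform.py
# below): it takes a (video, audio, meta) sample and returns a
# (n_frames, 3, frame_size, frame_size) tensor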

24 changes: 24 additions & 0 deletions src/open_clip/model_configs/ViViT-B-32.json
@@ -0,0 +1,24 @@
{
"embed_dim": 512,
"vision_cfg": {
"image_size": 224,
"layers": 12,
"width": 768,
"patch_size": 32
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12
},
"temporal_cfg": {
"context_length": 32,
"width": 512,
"heads": 8,
"layers": 12,
"mlp_ratio": 4,
"pooler_type": "cls_pooler"
}
}
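The temporal context_length (read in factory.py as model.visual.context_length) is the number of frames fed to the temporal transformer. With the take_every_nth=5 hard-coded in factory.py, one sample spans context_length * 5 raw frames; a quick back-of-the-envelope check, assuming 30 fps source video:

fps = 30                                        # assumed source frame rate
n_frames, take_every_nth = 32, 5                # from this config and factory.py
clip_seconds = n_frames * take_every_nth / fps  # ~5.3 s of video per sample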
24 changes: 24 additions & 0 deletions src/open_clip/model_configs/ViViT-B-32_short.json
@@ -0,0 +1,24 @@
{
"embed_dim": 512,
"vision_cfg": {
"image_size": 224,
"layers": 12,
"width": 768,
"patch_size": 32
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12
},
"temporal_cfg": {
"context_length": 8,
"width": 512,
"heads": 8,
"layers": 12,
"mlp_ratio": 4,
"pooler_type": "cls_pooler"
}
}
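ViViT-B-32_short is identical to ViViT-B-32 except that the temporal context_length drops from 32 frames to 8 (about 1.3 s per clip under the same assumptions).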
24 changes: 24 additions & 0 deletions src/open_clip/model_configs/ViViT-L-14_short.json
@@ -0,0 +1,24 @@
{
"embed_dim": 768,
"vision_cfg": {
"image_size": 224,
"layers": 24,
"width": 1024,
"patch_size": 14
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"temporal_cfg": {
"context_length": 8,
"width": 768,
"heads": 12,
"layers": 12,
"mlp_ratio": 4,
"pooler_type": "cls_pooler"
}
}
69 changes: 68 additions & 1 deletion src/open_clip/transform.py
@@ -7,7 +7,7 @@
import torchvision.transforms.functional as F

from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
CenterCrop
CenterCrop, ToPILImage

from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

@@ -131,3 +131,70 @@ def image_transform(
normalize,
])
return Compose(transforms)


# TODO: needs improvement
def video_transform(
frame_size: int,
n_frames: int,
take_every_nth: int,
is_train: bool,
frame_mean: Optional[Tuple[float, ...]] = None,
frame_std: Optional[Tuple[float, ...]] = None,
):

frame_mean = frame_mean or OPENAI_DATASET_MEAN
if not isinstance(frame_mean, (list, tuple)):
frame_mean = (frame_mean,) * 3

frame_std = frame_std or OPENAI_DATASET_STD
if not isinstance(frame_std, (list, tuple)):
frame_std = (frame_std,) * 3

normalize = Normalize(mean=frame_mean, std=frame_std)

if is_train:
transforms = [
ToPILImage(),
RandomResizedCrop(
frame_size,
scale=(0.9, 1.0),
interpolation=InterpolationMode.BICUBIC,
),
_convert_to_rgb,
ToTensor(),
normalize,
]
else:
transforms = [
ToPILImage(),
Resize(frame_size, interpolation=InterpolationMode.BICUBIC),
CenterCrop(frame_size),
_convert_to_rgb,
ToTensor(),
normalize,
]

frame_transform = Compose(transforms)
def apply_frame_transform(sample):
video, audio, video_meta = sample
video = video.permute(0, 3, 1, 2)

video = video[::take_every_nth]
video = video[:n_frames] # TODO: maybe make this middle n frames

# TODO: maybe padding isn't the way to go
# TODO: also F.pad is acting up for some reason
# isn't letting me input a len 8 tuple for 4d tensor???
# video = F.pad(video, tuple([0, 0]*len(video.shape[-3:]) + [0, n_frames - video.shape[0]]))

if video.shape[0] < n_frames:
padded_video = torch.zeros(n_frames, *video.shape[1:])
padded_video[:video.shape[0]] = video
video = padded_video

# TODO: this .float() is weird, look at how this is done in other places
return torch.cat([frame_transform(frame.float())[None, ...] for frame in video])


return apply_frame_transform
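A minimal sketch of the transform in isolation, assuming the dataloader yields decoded video as a (T, H, W, C) uint8 tensor alongside audio and metadata:

transform = video_transform(frame_size=224, n_frames=32, take_every_nth=5, is_train=False)
sample = (torch.randint(0, 256, (300, 360, 640, 3), dtype=torch.uint8), None, {})
frames = transform(sample)  # -> (32, 3, 224, 224); clips shorter than n_frames are zero-padded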
2 changes: 1 addition & 1 deletion src/open_clip/transformer.py
@@ -497,7 +497,7 @@ def forward(self, x: torch.Tensor):

if self.output_tokens:
return pooled, tokens

return pooled

