Commit 5421f53b authored by kayef.ahamad

micropose-detection

parent 524de3f4
FROM python:3.8
RUN apt-get update && apt-get install -y tzdata vim ffmpeg libsm6 libxext6
RUN pip3 install --upgrade pip
RUN pip3 install "matplotlib>=3.2.2" "tensorboard>=2.4.1" "numpy>=1.18.5" "opencv-python>=4.1.2" "Pillow>=7.1.2" "PyYAML>=5.3.1" "requests>=2.23.0" "scipy>=1.4.1" "tqdm>=4.41.0" pandas seaborn expiringdict minio cachetools
RUN pip3 install pymongo Cython paho-mqtt==1.5.0
ADD . /app
ADD Arial.ttf /root/.config/Ultralytics/
WORKDIR /app
COPY data/kinari.mp4 data/
COPY data/labels.txt data/
CMD ["python3","app.py"]
# Micropose Detection
**Micropose detection using action recognition**
Model weight files can be downloaded from the following links and need to be placed inside the `data` folder:
[https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/resnet34-333f7ec4.pth](https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/resnet34-333f7ec4.pth)
and [https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/save_10.pth](https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/save_10.pth)
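
A minimal download sketch (assuming the blob URLs above are publicly readable and `requests` is installed; the destination file names are taken from the URLs):

```python
import os
import requests

WEIGHT_URLS = [
    "https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/resnet34-333f7ec4.pth",
    "https://azrmlilensqa006382180551.blob.core.windows.net/mlflow-vm-container/shikhin/gokaldas_image_bkp/app/data/save_10.pth",
]

os.makedirs("data", exist_ok=True)
for url in WEIGHT_URLS:
    dest = os.path.join("data", url.rsplit("/", 1)[-1])
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                f.write(chunk)
```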
Micropose Detection
import cv2
from .utils import load_json, load_value_file
import os
def get_video_names_and_annotations(data, subset):
"""Selects clips of a given subset from the parsed json annotation"""
video_names = []
annotations = []
for key, value in data['database'].items():
this_subset = value['subset']
if this_subset == subset:
label = value['annotations']['label']
video_names.append('{}/{}'.format(label, key))
annotations.append(value)
return video_names, annotations
def get_video_props(video_path, video_format, annotation):
"""Tries to read video properties (total number of frames and FPS) from annotation
file or read it from file otherwise"""
n_frames = annotation.get('n_frames')
fps = annotation.get('fps')
if n_frames and fps:
return n_frames, fps
if video_format == 'frames':
if not os.path.exists(video_path):
return 0, 0
        n_frames = int(load_value_file(os.path.join(video_path, 'n_frames')))
fps = 30
else:
        cap = cv2.VideoCapture(str(video_path))
        n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()
return n_frames, fps
def load_json_annotation(root_path, annotation_path, subset, flow_path=None, video_format='frames'):
"""Load annotation in ActivityNet-like format"""
data = load_json(annotation_path)
video_names, annotations = get_video_names_and_annotations(data, subset)
idx_to_class = dict(enumerate(data['labels']))
class_to_idx = {v: k for k, v in idx_to_class.items()}
videos = []
for i, (video_name, annotation) in enumerate(zip(video_names, annotations)):
if i % 1000 == 0:
print('dataset loading [{}/{}]'.format(i, len(video_names)))
if video_format == 'video' and not video_name.lower().endswith('.mp4'):
video_name += '.mp4'
        video_path = os.path.join(str(root_path), video_name)
n_frames, fps = get_video_props(video_path, video_format, annotation)
if n_frames == 0:
continue
flow_full_path = flow_path
if flow_path is not None:
flow_full_path = (flow_path / video_name).as_posix()
begin_t = 1
end_t = n_frames
sample = {
'video': video_path,
'flow': flow_full_path,
'segment': [begin_t, end_t],
'n_frames': n_frames,
'fps': fps,
'video_id': video_name.split('/')[1],
'label': class_to_idx[annotation['annotations']['label']],
}
videos.append(sample)
return videos, idx_to_class
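

if __name__ == "__main__":
    # Hedged sketch of the ActivityNet-like annotation layout the loaders above expect
    # (class names, video keys and frame counts are placeholders, not real data).
    example = {
        'labels': ['sewing', 'idle'],              # index order defines the class ids
        'database': {
            'clip_0001': {
                'subset': 'training',              # training / validation / testing
                'annotations': {'label': 'sewing'},
                'n_frames': 120,                   # optional; read from the video if absent
                'fps': 30,                         # optional
            },
        },
    }
    names, anns = get_video_names_and_annotations(example, 'training')
    assert names == ['sewing/clip_0001']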
import copy
from collections import Counter
import numpy as np
import torch
from torch.utils import data
from action_recognition.utils import cached
from action_recognition.video_reader import make_video_reader, read_flow
from .annotation import load_json_annotation
@cached()
def load_annotation(annotation_path, flow_path, root_path, subset, video_format):
return load_json_annotation(root_path, annotation_path, subset, flow_path, video_format)
def make_dataset(args, subset, spatial_transform, temporal_transform, target_transform):
"""Constructs VideoDataset instance for specified subset"""
assert subset in ['training', 'validation', 'testing']
if subset == 'testing':
num_samples_per_video = args.n_test_clips
elif subset == 'validation':
num_samples_per_video = args.n_val_clips
else: # train
num_samples_per_video = 1
if subset == 'testing':
if args.test_subset == 'val':
subset = 'validation'
elif args.test_subset == 'test':
subset = 'testing'
return_flow = False
return_rgb = True
if "flow" in args.model:
if "two_stream" not in args.model:
return_rgb = False
return_flow = True
return VideoDataset(
args.video_path,
args.annotation_path,
subset,
num_samples_per_video,
spatial_transform,
temporal_transform,
target_transform,
sample_duration=(args.sample_duration * args.temporal_stride),
flow_path=args.flow_path,
return_rgb=return_rgb,
return_flow=return_flow,
video_format=getattr(args, 'video_format', None),
image_reader=getattr(args, 'image_reader', "opencv")
)
def sample_clips(videos, num_samples_per_video, sample_duration):
"""Extracts clips with given length from each video.
Args:
videos: List of video samples
        num_samples_per_video: Number of clips sampled from each video. If num_samples_per_video == 1,
            all frames of the video are returned as a single clip. If num_samples_per_video == 0, all
            sequential clips of length sample_duration are sampled. If num_samples_per_video > 1, clips
            are sampled uniformly over the video.
        sample_duration: Number of frames in sampled clips. Actual value may be smaller for short clips.
Returns: List of clip samples
"""
videos = sorted(videos, key=lambda v: v['video'].split('/')[-1])
clips = []
for sample in videos:
segment_start, segment_end = sample['segment']
n_frames = segment_end - segment_start + 1
if num_samples_per_video == 1:
# use all frames from video
sample['frame_indices'] = list(range(segment_start, segment_end + 1))
clips.append(sample)
else:
            if num_samples_per_video == 0:
                # use all sequential clips with sample_duration
                step = sample_duration
                last_clip_start = segment_end
            else:
                step = max(1, (n_frames - sample_duration) // (num_samples_per_video - 1))
                last_clip_start = segment_start + step * (num_samples_per_video - 1)
            for clip_start in range(segment_start, last_clip_start + 1, step):
sampled_clip = copy.deepcopy(sample)
clip_end = min(segment_end + 1, clip_start + sample_duration)
sampled_clip['frame_indices'] = list(range(clip_start, clip_end))
if sampled_clip['frame_indices']:
clips.append(sampled_clip)
return clips
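

# Worked example of the sampling modes above, for a toy sample
# {'video': 'root/sewing/clip_0001', 'segment': [1, 40]} (40 frames):
#   num_samples_per_video == 1                       -> a single clip with frame indices 1..40
#   num_samples_per_video == 3, sample_duration == 16 -> step = max(1, (40 - 16) // 2) = 12,
#       clip starts 1, 13 and 25, i.e. clips 1..16, 13..28 and 25..40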
class VideoDataset(data.Dataset):
"""Generic video dataset.
Args:
video_path (Path): Directory with video files. Will be used by annotation_loader to resolve real paths.
annotation_path (Path): Path to annotation file.
subset (str): Which subset of dataset to use (validation/training/testing)
annotation_format (str): Format of the annotation file.
n_samples_for_each_video (int): How many clips should be sampled from every video per epoch.
spatial_transform (callable): A function/transform that takes in a clip (list of frames) and returns
transformed version
temporal_transform (callable): A function/transform that takes a list of clip frames and returns transformed
version
target_transform (callable): A function/transform that takes in the annotation object and returns transformed
version
sample_duration (int): Number of frames in sampled clips
        flow_path (Path): Path to an optical flow directory
return_rgb (bool): Whether RGB frame should be returned
return_flow (bool): Whether Optical flow should be returned
video_reader (callable): Callable that takes in a path to video and transformed frame indices and returns
list of frames. If None, then object will be created according to the video_format.
video_format (str): Type of video_loader to be instantiated. If "video", then created video_loader will
attempt to read frames from .mp4 file. If "frames", then it will try to read from directory with images
image_reader (str): Backend for reading image files (pil, opencv, accimage)
"""
def __init__(
self,
video_path,
annotation_path,
subset,
n_samples_for_each_video=1,
spatial_transform=None,
temporal_transform=None,
target_transform=None,
sample_duration=16,
flow_path=None,
return_rgb=True,
return_flow=False,
video_reader=None,
video_format='frames',
image_reader='opencv'
):
if not video_reader:
self.video_loader = make_video_reader(video_format, image_reader)
else:
self.video_loader = video_reader
self.data, self.class_names = load_annotation(annotation_path, flow_path, video_path, subset, video_format)
if not self.data:
raise ValueError("No videos found in {!s} directory. Please check correctness of provided paths"
.format(video_path))
self.data = sample_clips(self.data, n_samples_for_each_video, sample_duration)
self.spatial_transform = spatial_transform
self.temporal_transform = temporal_transform
self.target_transform = target_transform
self.return_rgb = return_rgb
self.return_flow = return_flow
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
clips (dict): Dictionary where keys are input modalities and values are corresponding tensors
targets (dict): Dictionary with annotation data (label, video_id, etc)
"""
clip_index = index // len(self.spatial_transform)
spatial_transform_index = index % len(self.spatial_transform)
self.spatial_transform[spatial_transform_index].randomize_parameters()
frame_indices = self.data[clip_index]['frame_indices']
if self.temporal_transform is not None:
frame_indices = self.temporal_transform(frame_indices)
clips = {
**self._read_rgb(clip_index, frame_indices, spatial_transform_index),
**self._read_flow(clip_index, frame_indices, spatial_transform_index)
}
target = self.data[clip_index]
if self.target_transform is not None:
target = self.target_transform(target)
return clips, target
def _read_rgb(self, clip_index, frames, spatial_transform_index):
if not self.return_rgb:
return {}
video_path = self.data[clip_index]['video']
clip = self.video_loader(str(video_path), frames)
clip = [self.spatial_transform[spatial_transform_index](frame) for frame in clip]
return {'rgb_clip': torch.stack(clip, 0)}
def _read_flow(self, clip_index, frames, spatial_transform_index):
if not self.return_flow:
return {}
flow_path = self.data[clip_index]['flow']
clip = read_flow(str(flow_path), frames)
clip = [self.spatial_transform[spatial_transform_index](frame) for frame in clip]
clip = torch.stack(clip, 0)
N, _, H, W = clip.shape
clip = clip.view((N // 2, 2, H, W))
return {'flow_clip': clip}
def __len__(self):
return len(self.data) * len(self.spatial_transform)
def get_sample_weights(self, class_weights):
"""Transforms per-class sampling probability to per-clip sampling probability.
Used with torch.utils.data.WeightedRandomSampler in order to balance classes"""
if class_weights is not None:
class_weights = np.asarray(class_weights)
class_weights /= np.sum(class_weights)
else:
num_labels = len(self.class_names)
sample_count = Counter(data_elem['label'] for data_elem in self.data)
class_weights = [(1 / sample_count[label]) / num_labels for label in range(num_labels)]
return [class_weights[data_elem['label']] for data_elem in self.data]
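

if __name__ == "__main__":
    # Hedged smoke-test sketch: "data/frames" and "data/annotation.json" are placeholder
    # paths, and the dummy transform below only mimics the interface __getitem__ expects
    # (an indexable collection whose items support randomize_parameters() and __call__).
    from torch.utils.data import DataLoader, WeightedRandomSampler
    from torchvision import transforms

    class _ToTensor(transforms.ToTensor):
        def randomize_parameters(self):
            pass

    dataset = VideoDataset(
        "data/frames",                  # placeholder root with extracted frames
        "data/annotation.json",         # placeholder ActivityNet-like annotation
        "training",
        spatial_transform=[_ToTensor()],
        sample_duration=16,
    )
    # Per-clip weights derived from inverse class frequency, for class-balanced sampling.
    weights = dataset.get_sample_weights(class_weights=None)
    sampler = WeightedRandomSampler(weights, num_samples=len(weights))
    loader = DataLoader(dataset, batch_size=8, sampler=sampler, num_workers=2)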
from torch.nn import functional as F
from torch import nn
import torch
from .model import create_model
from .utils import load_state
class LogitKLDivLoss(nn.Module):
"""Kullback–Leibler divergence loss. Inputs predicted and ground truth logits.
Args:
T (float): Softmax temperature.
"""
def __init__(self, T=1):
super().__init__()
self.T = T
def forward(self, p_logits, q_logits, **kwargs):
log_p = F.log_softmax(p_logits / self.T, dim=1)
q = F.softmax(q_logits / self.T, dim=1)
return F.kl_div(log_p, q, reduction='batchmean') * self.T ** 2
class DistillationLoss(nn.Module):
"""Knowledge distillation loss.
Args:
teacher_model (torch.nn.Module): Model that will be used for supervision.
T (float): Softmax temperature.
"""
def __init__(self, teacher_model, T=1):
super().__init__()
self.teacher_model = teacher_model
self.kl_div = LogitKLDivLoss(T)
def forward(self, outputs, inputs, **kwargs):
"""
Args:
outputs: Predicted student model logits
inputs: Inputs that have been used to produce outputs.
"""
with torch.no_grad():
teacher_logits = self.teacher_model(*inputs)
return self.kl_div(outputs, teacher_logits)
class SoftmaxLoss(nn.Module):
"""Classification loss"""
def forward(self, outputs, targets, **kwargs):
return F.cross_entropy(outputs, targets)
class WeightedSumLoss(nn.Module):
"""Aggregate multiple loss functions in one weighted sum."""
def __init__(self, normalize=False):
super().__init__()
self.normalize = normalize
self.losses = nn.ModuleDict()
self.weights = {}
self.values = {}
def forward(self, outputs, **kwargs):
total_loss = outputs.new(1).zero_()
for loss in self.losses:
loss_val = self.losses[loss](outputs=outputs, **kwargs)
total_loss += self.weights[loss] * loss_val
self.values[loss] = loss_val
if self.normalize:
total_loss /= sum(self.weights.values())
return total_loss
def add_loss(self, name, loss, weight=1.0):
self.weights[name] = weight
self.losses.add_module(name, loss)
def create_criterion(args):
criterion = WeightedSumLoss()
softmax = SoftmaxLoss()
criterion.add_loss('softmax', softmax)
if args.teacher_model:
teacher_model, _ = create_model(args, args.teacher_model)
checkpoint = torch.load(str(args.teacher_checkpoint))
load_state(teacher_model, checkpoint['state_dict'])
teacher_model.eval()
distillation_loss = DistillationLoss(teacher_model, T=8)
        criterion.add_loss('distillation', distillation_loss, weight=0.4)
return criterion
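

if __name__ == "__main__":
    # Hedged sketch of how the weighted sum is evaluated on toy tensors (softmax term only;
    # a distillation term would be attached with add_loss in the same way).
    criterion = WeightedSumLoss()
    criterion.add_loss('softmax', SoftmaxLoss(), weight=1.0)

    logits = torch.randn(4, 10)              # batch of 4 clips, 10 classes
    labels = torch.randint(0, 10, (4,))
    loss = criterion(outputs=logits, targets=labels)
    print(loss.item(), criterion.values['softmax'].item())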
import torch
from torch import nn
from .models import (densenet_3d, inception_i3d, lstm_attention,
multi_frame_baseline, video_transformer, vtn_motion,
vtn_two_stream)
from .models.modules.sync_batchnorm import (DataParallelWithCallback,
SynchronizedBatchNorm2d)
from .models.r3d import R3D_MODELS
from .utils import load_state
MODEL_REGISTRY = {
'vtn': lambda args, encoder: video_transformer.VideoTransformer(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
layer_norm=args.layer_norm,
),
'lstm': lambda args, encoder: lstm_attention.VisualAttentionLSTM(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
use_attention=False,
bidirectional=args.bidirectional_lstm
),
'attn_lstm': lambda args, encoder: lstm_attention.VisualAttentionLSTM(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
use_attention=True,
),
'vtn_rgbdiff': lambda args, encoder: vtn_motion.VideoTransformerMotion(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
mode='rgbdiff',
layer_norm=args.layer_norm,
),
'vtn_flow': lambda args, encoder: vtn_motion.VideoTransformerMotion(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
mode='flow',
layer_norm=args.layer_norm,
),
'vtn_two_stream': lambda args, encoder: vtn_two_stream.VideoTransformerTwoStream(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
motion_path=args.motion_path,
rgb_path=args.rgb_path,
mode='rgbdiff',
layer_norm=args.layer_norm,
),
'vtn_two_stream_flow': lambda args, encoder: vtn_two_stream.VideoTransformerTwoStream(
args.hidden_size,
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
motion_path=args.motion_path,
rgb_path=args.rgb_path,
mode='flow'
),
'baseline': lambda args, encoder: multi_frame_baseline.MultiFrameBaseline(
args.sample_duration,
encoder,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True
),
'resnet34_attn_single': lambda args, encoder: lstm_attention.ResnetAttSingleInput(
args.hidden_size,
args.sample_duration,
args.n_classes,
args.sample_size,
False if args.pretrain_path or args.resume_path else True,
resnet_size=34
),
'inception_i3d': lambda args, encoder: inception_i3d.InceptionI3D(
num_classes=args.n_classes
),
'densenet201': lambda args, encoder: densenet_3d.densenet201(
sample_size=args.sample_size,
sample_duration=args.sample_duration,
num_classes=400
)
,
**R3D_MODELS,
}
def make_bn_synchronized(bn_module):
"""Convert all BatchNorm modules to SynchronizedBatchNorm"""
new_module = SynchronizedBatchNorm2d(bn_module.num_features, eps=bn_module.eps, momentum=bn_module.momentum,
affine=bn_module.affine)
if new_module.track_running_stats:
new_module.running_mean = bn_module.running_mean
new_module.running_var = bn_module.running_var
new_module.num_batches_tracked = bn_module.num_batches_tracked
new_module.weight = bn_module.weight
new_module.bias = bn_module.bias
return new_module
def _replace_bns(model: nn.Module, memo=None):
if memo is None:
memo = set()
if model not in memo:
memo.add(model)
for name, module in model._modules.items():
if module is None:
continue
if isinstance(module, nn.BatchNorm2d):
if isinstance(model, nn.Sequential):
model._modules[name] = make_bn_synchronized(module)
else:
setattr(model, name, make_bn_synchronized(module))
_replace_bns(module, memo)
return model
def create_model(args, model, pretrain_path=None):
"""Construct model with a given name and args.
Args:
args (Namespace): Options for model construction
model (str): Name of the model in ENCODER_DECODER or DECODER format.
pretrain_path (Path): Path to a checkpoint with the pretrained model.
"""
model = model.replace("self_attn", "vtn")
if len(model.split('_')) > 1:
encoder_name, model_type = model.split('_', 1)
else:
encoder_name = args.encoder
model_type = model
encoder_name = encoder_name.replace('-', '_')
if model in MODEL_REGISTRY:
# if model with exactly same name is known
model = MODEL_REGISTRY[model](args, encoder_name)
else:
model = MODEL_REGISTRY[model_type](args, encoder_name)
# load pre-trained model
if pretrain_path:
        print('loading pretrained model {}'.format(pretrain_path))
        pretrain = torch.load(str(pretrain_path))
if hasattr(model, 'load_checkpoint'):
model.load_checkpoint(pretrain['state_dict'])
else:
load_state(model, pretrain['state_dict'])
if args.cuda:
model = model.cuda()
if args.sync_bn:
model = _replace_bns(model)
wrapped_model = DataParallelWithCallback(model)
else:
wrapped_model = nn.DataParallel(model)
else:
wrapped_model = model
if args.fp16:
model = model.half()
# do not train batchnorms in FP16 precision
def _float_bns(layer):
if isinstance(layer, (nn.BatchNorm2d,)):
layer.float()
model.apply(_float_bns)
parameters = model.trainable_parameters()
return wrapped_model, parameters
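

if __name__ == "__main__":
    # Hypothetical illustration of the ENCODER_DECODER naming convention parsed above:
    # "resnet34_vtn" splits into encoder "resnet34" and decoder "vtn", while a bare
    # decoder name such as "vtn" falls back to args.encoder.
    encoder_name, model_type = "resnet34_vtn".split('_', 1)
    assert (encoder_name, model_type) == ("resnet34", "vtn")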
from collections import namedtuple
from torch import nn
from . import resnet
from . import mobilenetv2
from . import rmnet
Encoder = namedtuple('Encoder', ('model', 'features', 'features_shape'))
def make_encoder(name, input_size=224, input_channels=3, pretrained=None):
"""Make encoder (backbone) with a given name and parameters"""
features_size = input_size // 32
num_features = 2048
if name.startswith('resnet'):
model = getattr(resnet, name)(pretrained=pretrained, num_channels=input_channels)
features = nn.Sequential(*list(model.children())[:-2])
num_features = 512 if int(name[6:]) < 50 else 2048
elif name.startswith('mobilenetv2'):
model = mobilenetv2.MobileNetV2(input_size=input_size, pretrained=None)
features = model.features
num_features = 1280
elif name.startswith('rmnet'):
model = rmnet.RMNetClassifier(1000, pretrained=None)
features = nn.Sequential(*list(model.children())[:-2])
num_features = 512
elif name.startswith('se_res'):
model = load_from_pretrainedmodels(name)(pretrained='imagenet' if pretrained else None)
features = nn.Sequential(*list(model.children())[:-2])
else:
raise KeyError("Unknown model name: {}".format(name))
features_shape = (num_features, features_size, features_size)
return Encoder(model, features, features_shape)
def load_from_pretrainedmodels(model_name):
import pretrainedmodels
return getattr(pretrainedmodels, model_name)
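

if __name__ == "__main__":
    import torch

    # Hedged usage sketch: build a ResNet-34 backbone (no pretrained weights, so nothing
    # is downloaded) and check the feature-map shape the decoders consume: (C, H/32, W/32).
    encoder = make_encoder('resnet34', input_size=224)
    assert encoder.features_shape == (512, 7, 7)
    feat = encoder.features(torch.randn(1, 3, 224, 224))
    assert tuple(feat.shape) == (1, 512, 7, 7)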
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup),
nn.ReLU()
)
def conv_1x1_bn(inp, oup):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
nn.ReLU()
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = round(inp * expand_ratio)
self.use_res_connect = self.stride == 1 and inp == oup
if expand_ratio == 1:
self.conv = nn.Sequential(
# dw
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU(),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
else:
self.conv = nn.Sequential(
# pw
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU(),
# dw
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU(),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, n_class=1000, input_size=224, width_mult=1., pretrained=None):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
interverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1],
[6, 24, 2, 2],
[6, 32, 3, 2],
[6, 64, 4, 2],
[6, 96, 3, 1],
[6, 160, 3, 2],
[6, 320, 1, 1],
]
# building first layer
assert input_size % 32 == 0
input_channel = int(input_channel * width_mult)
self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
self.features = [conv_bn(3, input_channel, 2)]
# building inverted residual blocks
for t, c, n, s in interverted_residual_setting:
output_channel = int(c * width_mult)
for i in range(n):
if i == 0:
self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
else:
self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
input_channel = output_channel
# building last several layers
self.features.append(conv_1x1_bn(input_channel, self.last_channel))
# make it nn.Sequential
self.features = nn.Sequential(*self.features)
# building classifier
self.classifier = nn.Sequential(
nn.Dropout(0.2),
nn.Linear(self.last_channel, n_class),
)
if pretrained:
checkpoint = torch.load(pretrained)
self.load_state_dict(checkpoint)
else:
self._initialize_weights()
def forward(self, x):
x = self.features(x)
        x = F.avg_pool2d(x, 7).view(-1, self.last_channel)
# x = x.mean(3).mean(2)
x = self.classifier(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()
import collections
import math
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from ...utils import drop_last
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, num_channels=3):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def check_conv1_params(model, pretrained_weights):
if model.conv1.in_channels != pretrained_weights['conv1.weight'].size(1):
# get mean over RGB channels weights
rgb_mean = torch.mean(pretrained_weights['conv1.weight'], dim=1)
expand_ratio = model.conv1.in_channels // pretrained_weights['conv1.weight'].size(1)
pretrained_weights['conv1.weight'] = pretrained_weights['conv1.weight'].repeat(1, expand_ratio, 1, 1)
# pretrained_weights['conv1.weight'] = rgb_mean.unsqueeze(1).repeat(1, model.conv1.in_channels, 1, 1)
def average_conv1_weights(old_params, in_channels):
new_params = collections.OrderedDict()
layer_count = 0
all_key_list = old_params.keys()
for layer_key in drop_last(all_key_list, 2):
if layer_count == 0:
rgb_weight = old_params[layer_key]
rgb_weight_mean = torch.mean(rgb_weight, dim=1)
flow_weight = rgb_weight_mean.unsqueeze(1).repeat(1, in_channels, 1, 1)
if isinstance(flow_weight, torch.autograd.Variable):
new_params[layer_key] = flow_weight.data
else:
new_params[layer_key] = flow_weight
layer_count += 1
else:
new_params[layer_key] = old_params[layer_key]
layer_count += 1
return new_params
def load_pretrained_resnet(model, resnet_name='resnet34', num_channels=3):
if num_channels == 3:
pretrained_weights = model_zoo.load_url(model_urls[resnet_name])
        check_conv1_params(model, pretrained_weights)
model.load_state_dict(pretrained_weights)
else:
pretrained_dict = model_zoo.load_url(model_urls[resnet_name])
model_dict = model.state_dict()
new_pretrained_dict = average_conv1_weights(pretrained_dict, num_channels)
# 1. filter out unnecessary keys
new_pretrained_dict = {k: v for k, v in new_pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(new_pretrained_dict)
# 3. load the new state dict
model.load_state_dict(model_dict)
return model
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
num_channels = 3
if 'num_channels' in kwargs:
num_channels = kwargs['num_channels']
if pretrained:
model = load_pretrained_resnet(model, 'resnet18', num_channels)
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
num_channels = 3
if 'num_channels' in kwargs:
num_channels = kwargs['num_channels']
if pretrained:
model = load_pretrained_resnet(model, 'resnet34', num_channels)
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
num_channels = 3
if 'num_channels' in kwargs:
num_channels = kwargs['num_channels']
if pretrained:
model = load_pretrained_resnet(model, 'resnet50', num_channels)
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
num_channels = 3
if 'num_channels' in kwargs:
num_channels = kwargs['num_channels']
if pretrained:
model = load_pretrained_resnet(model, 'resnet101', num_channels)
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
num_channels = 3
if 'num_channels' in kwargs:
num_channels = kwargs['num_channels']
if pretrained:
model = load_pretrained_resnet(model, 'resnet152', num_channels)
return model
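

if __name__ == "__main__":
    # Hedged sketch: a 2-channel (optical-flow) ResNet-34 whose conv1 kernels are built by
    # averaging the pretrained RGB kernels (see average_conv1_weights). Assumes network
    # access for the torchvision model-zoo download.
    flow_backbone = resnet34(pretrained=True, num_channels=2)
    out = flow_backbone(torch.randn(1, 2, 224, 224))
    assert tuple(out.shape) == (1, 1000)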
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from ...utils import load_state
class RMBlock(nn.Module):
def __init__(self, input_planes, squeeze_planes, output_planes, downsample=False, dropout_ratio=0.1,
activation=nn.ELU):
super(RMBlock, self).__init__()
self.downsample = downsample
self.input_planes = input_planes
self.output_planes = output_planes
self.squeeze_conv = nn.Conv2d(input_planes, squeeze_planes, kernel_size=1, bias=False)
self.squeeze_bn = nn.BatchNorm2d(squeeze_planes)
self.dw_conv = nn.Conv2d(squeeze_planes, squeeze_planes, groups=squeeze_planes, kernel_size=3, padding=1,
stride=2 if downsample else 1, bias=False)
self.dw_bn = nn.BatchNorm2d(squeeze_planes)
self.expand_conv = nn.Conv2d(squeeze_planes, output_planes, kernel_size=1, bias=False)
self.expand_bn = nn.BatchNorm2d(output_planes)
self.activation = activation(inplace=True)
self.dropout_ratio = dropout_ratio
if self.downsample:
self.skip_conv = nn.Conv2d(input_planes, output_planes, kernel_size=1, bias=False)
self.skip_conv_bn = nn.BatchNorm2d(output_planes)
self.init_weights()
def init_weights(self):
for m in self.children():
if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
def forward(self, x):
residual = x
out = self.activation(self.squeeze_bn(self.squeeze_conv(x)))
out = self.activation(self.dw_bn(self.dw_conv(out)))
out = self.expand_bn(self.expand_conv(out))
if self.dropout_ratio > 0:
out = F.dropout(out, p=self.dropout_ratio, training=self.training, inplace=True)
if self.downsample:
residual = F.max_pool2d(x, kernel_size=2, stride=2, padding=0)
residual = self.skip_conv(residual)
residual = self.skip_conv_bn(residual)
out += residual
return self.activation(out)
class RMNetBody(nn.Module):
def __init__(self, block=RMBlock, blocks_per_stage=(None, 4, 8, 10, 11), trunk_width=(32, 32, 64, 128, 256),
bottleneck_width=(None, 8, 16, 32, 64)):
super(RMNetBody, self).__init__()
assert len(blocks_per_stage) == len(trunk_width) == len(bottleneck_width)
self.dim_out = trunk_width[-1]
stages = [nn.Sequential(OrderedDict([
('data_bn', nn.BatchNorm2d(3)),
('conv1', nn.Conv2d(3, trunk_width[0], kernel_size=3, stride=2, padding=1, bias=False)),
('bn1', nn.BatchNorm2d(trunk_width[0])),
('relu1', nn.ReLU(inplace=True))
])),
]
for i, (blocks_num, w, wb) in enumerate(zip(blocks_per_stage, trunk_width, bottleneck_width)):
# Zeroth stage is already added.
if i == 0:
continue
stage = []
# Do not downscale input to the first stage.
if i > 1:
stage.append(block(trunk_width[i - 1], wb, w, downsample=True))
for _ in range(blocks_num):
stage.append(block(w, wb, w))
stages.append(nn.Sequential(*stage))
self.stages = nn.Sequential(OrderedDict([
('stage_{}'.format(i), stage) for i, stage in enumerate(stages)
]))
self.init_weights()
def init_weights(self):
m = self.stages[0][0] # ['data_bn']
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)
        m = self.stages[0][1]  # ['conv1']
        nn.init.kaiming_normal_(m.weight, mode='fan_out')
        m = self.stages[0][2]  # ['bn1']
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)
# All other blocks should be initialized internally during instantiation.
def forward(self, x):
return self.stages(x)
class RMNetClassifier(nn.Module):
def __init__(self, num_classes, body=RMNetBody, dropout_ratio=0.1, pretrained=None):
super(RMNetClassifier, self).__init__()
self.dropout_ratio = dropout_ratio
self.backbone = body()
self.extra_conv_bn_relu = nn.Sequential(nn.Conv2d(256, 512, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(512), nn.ELU())
self.extra_conv_bn_relu_2 = nn.Sequential(nn.Conv2d(512, 1024, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(1024), nn.ReLU())
self.fc = nn.Conv2d(1024, num_classes, 1, stride=1, padding=0)
if pretrained:
checkpoint = torch.load(pretrained)
load_state(self, checkpoint)
# self.load_state_dict(checkpoint)
def forward(self, x):
x = self.backbone(x)
x = self.extra_conv_bn_relu(x)
x = self.extra_conv_bn_relu_2(x)
x = F.avg_pool2d(x, (4, 4))
x = self.fc(x)
x = x.view(-1, x.size(1))
return x
import math
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from action_recognition.utils import get_fine_tuning_parameters
__all__ = [
'DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'
]
def densenet121(**kwargs):
model = DenseNet(
num_init_features=64,
growth_rate=32,
block_config=(6, 12, 24, 16),
**kwargs)
return model
def densenet169(**kwargs):
model = DenseNet(
num_init_features=64,
growth_rate=32,
block_config=(6, 12, 32, 32),
**kwargs)
return model
def densenet201(**kwargs):
model = DenseNet(
num_init_features=64,
growth_rate=32,
block_config=(6, 12, 48, 32),
**kwargs)
return model
def densenet264(**kwargs):
model = DenseNet(
num_init_features=64,
growth_rate=32,
block_config=(6, 12, 64, 48),
**kwargs)
return model
class SqBn(nn.BatchNorm2d):
def forward(self, input):
return super().forward(input.squeeze(2))
class _DenseLayer(nn.Sequential):
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm3d(num_input_features))
self.add_module('relu.1', nn.ReLU(inplace=True))
self.add_module('conv.1',
nn.Conv3d(
num_input_features,
bn_size * growth_rate,
kernel_size=1,
stride=1,
bias=False))
self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate))
self.add_module('relu.2', nn.ReLU(inplace=True))
self.add_module('conv.2',
nn.Conv3d(
bn_size * growth_rate,
growth_rate,
kernel_size=3,
stride=1,
padding=1,
bias=False))
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(
new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate,
drop_rate):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate,
growth_rate, bn_size, drop_rate)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm3d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv',
nn.Conv3d(
num_input_features,
num_output_features,
kernel_size=1,
stride=1,
bias=False))
self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2))
class DenseNet(nn.Module):
"""Densenet-BC model class
Args:
growth_rate (int) - how many filters to add each layer (k in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottle neck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
"""
def __init__(self,
sample_size,
sample_duration,
growth_rate=32,
block_config=(6, 12, 24, 16),
num_init_features=64,
bn_size=4,
drop_rate=0,
num_classes=1000):
super(DenseNet, self).__init__()
self.sample_size = sample_size
self.sample_duration = sample_duration
# First convolution
self.features = nn.Sequential(
OrderedDict([
('conv0',
nn.Conv3d(
3,
num_init_features,
kernel_size=7,
stride=(1, 2, 2),
padding=(3, 3, 3),
bias=False)),
('norm0', nn.BatchNorm3d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(
num_layers=num_layers,
num_input_features=num_features,
bn_size=bn_size,
growth_rate=growth_rate,
drop_rate=drop_rate)
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(
num_input_features=num_features,
num_output_features=num_features // 2)
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', SqBn(num_features))
for m in self.modules():
if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
last_duration = int(math.ceil(self.sample_duration / 16))
last_size = int(math.floor(self.sample_size / 32))
out = F.avg_pool2d(
out, kernel_size=(last_size, last_size)).view(
features.size(0), -1)
out = self.classifier(out)
return out
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
import torch
from torch import nn
from torch.nn import functional as F
from ..utils import get_fine_tuning_parameters
def calc_same_padding(kernel_shape, stride=None):
if stride is None:
stride = (1,) * len(kernel_shape)
return [(ks - 1) // 2 for ks, st in zip(kernel_shape, stride)]
def pad_same(input, kernel_size, stride=(1, 1, 1), dilation=(1, 1, 1), value=0):
t_left, t_right = get_pad_value(input.size(2), kernel_size[0], stride[0], dilation[0])
rows_left, rows_right = get_pad_value(input.size(3), kernel_size[1], stride[1], dilation[1])
cols_left, cols_right = get_pad_value(input.size(4), kernel_size[2], stride[2], dilation[2])
input = F.pad(input, (cols_left, cols_right, rows_left, rows_right, t_left, t_right), value=value)
return input
def get_pad_value(input_size, filter_size, stride, dilation):
effective_filter_size = (filter_size - 1) * dilation + 1
out_size = (input_size + stride - 1) // stride
padding_needed = max(0, (out_size - 1) * stride + effective_filter_size - input_size)
padding_left = padding_needed // 2
padding_right = (padding_needed - 1) // 2 + 1
return padding_left, padding_right
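

# Worked example of the SAME-padding rule above, for a 7-long axis, 3-tap filter, stride 2:
#   out_size       = ceil(7 / 2) = 4
#   padding_needed = (4 - 1) * 2 + 3 - 7 = 2, split as (left, right) = (1, 1)
# so get_pad_value(7, 3, 2, 1) == (1, 1); pad_same() grows that axis from 7 to 9, which a
# stride-2 3-tap convolution then reduces back to 4.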
class Unit3d(nn.Module):
def __init__(self, input_channels, output_channels, kernel_shape=(1, 1, 1), stride=(1, 1, 1), use_batch_norm=True,
use_bias=False, use_relu=True, padding_valid=True):
super().__init__()
self.conv_3d = nn.Conv3d(input_channels, output_channels, kernel_size=kernel_shape, stride=stride,
padding=calc_same_padding(kernel_shape, stride) if not padding_valid else 0,
bias=use_bias)
if use_batch_norm:
self.batch_norm = nn.BatchNorm3d(output_channels)
# self.batch_norm.weight.data.ones_()
if use_relu:
self.relu = nn.ReLU()
self.use_batch_norm = use_batch_norm
self.use_relu = use_relu
self.padding_valid = padding_valid
def forward(self, x):
x = self.conv_3d(pad_same(x, self.conv_3d.kernel_size, self.conv_3d.stride) if self.padding_valid else x)
first_conv = x
if self.use_batch_norm:
x = self.batch_norm(x)
if self.use_relu:
x = self.relu(x)
return x
class InceptionBlock(nn.Module):
def __init__(self, input_channels, branch_channels):
super().__init__()
self.Branch_0_Conv3d_0a_1x1 = Unit3d(input_channels, branch_channels[0], kernel_shape=(1, 1, 1))
self.Branch_1_Conv3d_0a_1x1 = Unit3d(input_channels, branch_channels[1], kernel_shape=(1, 1, 1))
self.Branch_1_Conv3d_0b_3x3 = Unit3d(branch_channels[1], branch_channels[2], kernel_shape=(3, 3, 3))
self.Branch_2_Conv3d_0a_1x1 = Unit3d(input_channels, branch_channels[3], kernel_shape=(1, 1, 1))
self.Branch_2_Conv3d_0b_3x3 = Unit3d(branch_channels[3], branch_channels[4], kernel_shape=(3, 3, 3))
self.Branch_3_MaxPool3d_0a_3x3 = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1, 1, 1),
padding=0)
self.Branch_3_Conv3d_0b_1x1 = Unit3d(input_channels, branch_channels[5], kernel_shape=(1, 1, 1))
def forward(self, x, endpoint=False):
branch0 = self.Branch_0_Conv3d_0a_1x1(x)
branch1 = self.Branch_1_Conv3d_0a_1x1(x)
branch1 = self.Branch_1_Conv3d_0b_3x3(branch1)
branch2 = self.Branch_2_Conv3d_0a_1x1(x)
branch2 = self.Branch_2_Conv3d_0b_3x3(branch2)
branch3 = self.Branch_3_MaxPool3d_0a_3x3(
pad_same(x, self.Branch_3_MaxPool3d_0a_3x3.kernel_size, self.Branch_3_MaxPool3d_0a_3x3.stride, value=-999))
branch3 = self.Branch_3_Conv3d_0b_1x1(branch3)
inner_endpoint = branch3
if not endpoint:
return torch.cat((branch0, branch1, branch2, branch3), dim=1)
else:
return torch.cat((branch0, branch1, branch2, branch3), dim=1), inner_endpoint
class InceptionI3D(nn.Module):
def __init__(self, input_channels=3, num_classes=400, pretrain=False, dropout_rate=0.):
super().__init__()
assert pretrain is False, "Pretrain is not implemented"
self.Conv3d_1a_7x7 = Unit3d(input_channels, 64, kernel_shape=(7, 7, 7), stride=(2, 2, 2), padding_valid=True)
self.MaxPool3d_2a_3x3 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2),
padding=0)
self.Conv3d_2b_1x1 = Unit3d(64, 64, kernel_shape=(1, 1, 1))
self.Conv3d_2c_3x3 = Unit3d(64, 192, kernel_shape=(3, 3, 3))
self.MaxPool3d_3a_3x3 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2),
padding=0)
self.Mixed_3b = InceptionBlock(192, [64, 96, 128, 16, 32, 32])
self.Mixed_3c = InceptionBlock(256, [128, 128, 192, 32, 96, 64])
self.MaxPool3d_4a_3x3 = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2),
padding=0)
self.Mixed_4b = InceptionBlock(480, [192, 96, 208, 16, 48, 64])
self.Mixed_4c = InceptionBlock(512, [160, 112, 224, 24, 64, 64])
self.Mixed_4d = InceptionBlock(512, [128, 128, 256, 24, 64, 64])
self.Mixed_4e = InceptionBlock(512, [112, 144, 288, 32, 64, 64])
self.Mixed_4f = InceptionBlock(528, [256, 160, 320, 32, 128, 128])
self.MaxPool3d_5a_2x2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2),
padding=0)
self.Mixed_5b = InceptionBlock(832, [256, 160, 320, 32, 128, 128])
self.Mixed_5c = InceptionBlock(832, [384, 192, 384, 48, 128, 128])
self.avg_pool = nn.AvgPool3d(kernel_size=(2, 7, 7), stride=(1, 1, 1))
self.dropout = nn.Dropout3d(dropout_rate)
self.logits = Unit3d(1024, num_classes, kernel_shape=(1, 1, 1), use_batch_norm=False, use_relu=False,
use_bias=True)
def forward(self, x):
x = x.transpose(1, 2)
x = self.Conv3d_1a_7x7(x)
x = self.MaxPool3d_2a_3x3(pad_same(x, self.MaxPool3d_2a_3x3.kernel_size, self.MaxPool3d_2a_3x3.stride))
x = self.Conv3d_2b_1x1(x)
x = self.Conv3d_2c_3x3(x)
x = self.MaxPool3d_3a_3x3(pad_same(x, self.MaxPool3d_3a_3x3.kernel_size, self.MaxPool3d_3a_3x3.stride))
x = self.Mixed_3b(x)
x = self.Mixed_3c(x)
x = self.MaxPool3d_4a_3x3(pad_same(x, self.MaxPool3d_4a_3x3.kernel_size, self.MaxPool3d_4a_3x3.stride))
x = self.Mixed_4b(x)
x = self.Mixed_4c(x)
x = self.Mixed_4d(x)
x = self.Mixed_4e(x)
x = self.Mixed_4f(x)
x = self.MaxPool3d_5a_2x2(pad_same(x, self.MaxPool3d_5a_2x2.kernel_size, self.MaxPool3d_5a_2x2.stride))
x = self.Mixed_5b(x)
x = self.Mixed_5c(x)
x = self.avg_pool(x)
x = self.dropout(x)
logits = self.logits(x)
result = logits.mean(dim=2).squeeze(-1).squeeze(-1)
return result
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
from torch import nn
from torch.nn import functional as F
from torchvision import models as models
from ..utils import get_fine_tuning_parameters, load_state
from .backbone import make_encoder
from .modules import (Attention, AttentionLSTM, StateInitZero, squash_dims,
unsquash_dim)
class VisualAttentionLSTM(nn.Module):
"""LSTM architecture with attention mechanism (https://arxiv.org/pdf/1511.04119.pdf)"""
def __init__(self, embed_size, sequence_size, encoder='resnet34', n_classes=400, input_size=224, pretrained=True,
use_attention=False, num_layers=1, bidirectional=False):
super().__init__()
self.use_attention = use_attention
# backbone
encoder = make_encoder(encoder, input_size=input_size, pretrained=pretrained)
self.resnet = encoder.features # name is kept for compatibility with older checkpoints
self.dropout = nn.Dropout(p=0.5)
# self.state_init = StateInitFC(resnet1_channel_size, embed_size)
bidirectional_mult = 2 if bidirectional else 1
self.state_init = StateInitZero(embed_size, num_layers=num_layers * bidirectional_mult, batch_first=True)
if use_attention:
self.lstm = AttentionLSTM(encoder.features_shape[0], embed_size, encoder.features_shape[1] ** 2,
batch_first=True, num_layers=num_layers, dropout=0.2)
else:
            self.lstm = nn.LSTM(encoder.features_shape[0], embed_size, num_layers=num_layers, dropout=0.2,
                                batch_first=False, bidirectional=bidirectional)
self.fc = nn.Linear(bidirectional_mult * embed_size, n_classes)
self.out_dropout = nn.Dropout(0.5)
self.embed_size = embed_size
self.sequence_size = sequence_size
self.last_feature_size = encoder.features_shape[1]
self.init_weights()
def init_weights(self):
"""Initialize the weights."""
self.fc.weight.data.normal_(0.0, 0.02)
self.fc.bias.data.fill_(0)
def forward(self, images):
"""Extract the image feature vectors."""
# (B x T x C x H x W) -> (B*T x C x H x W)
images = squash_dims(images, (0, 1))
features = self.resnet(images)
features = self.dropout(features)
features = unsquash_dim(features, 0, (-1, self.sequence_size))
hx, cx = self.state_init(features)
# early_features = early_features.transpose(0, 1) # to T x B x H x W
# no attention
if not self.use_attention:
features = F.avg_pool2d(squash_dims(features, (0, 1)), 7)
features = unsquash_dim(features, 0, (-1, self.sequence_size))
features = features.squeeze(-1).squeeze(-1).transpose(0, 1)
ys, hidden = self.lstm(features, (hx, cx))
ys = ys.transpose(0, 1)
ys = self.fc(ys)
ys = ys.mean(1)
return ys
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
def load_checkpoint(self, state_dict):
load_state(self, state_dict, 'fc')
class ResnetAttSingleInput(nn.Module):
"""ONNX Exportable variant of the LSTM-Attenion model"""
def __init__(self, embed_size, sequence_size, n_classes=400, input_size=224, pretrained=True, resnet_size=50):
"""Load the pretrained ResNet and replace top fc layer."""
super().__init__()
# backbone
resnet_cls = getattr(models, "resnet{}".format(resnet_size))
resnet_model = resnet_cls(pretrained=pretrained)
modules = list(resnet_model.children())[:-2] # delete the last fc layer.
self.resnet1 = nn.Sequential(*modules)
self.dropout = nn.Dropout(p=0.5)
resnet1_channel_size = resnet_model.fc.in_features
resnet1_spatial_size = input_size // 32
self.last_feature_size = resnet1_spatial_size
self.embed_size = embed_size
self.sequence_size = sequence_size
num_layers = 1
self.attn = Attention(embed_size, None, self.last_feature_size * self.last_feature_size)
self.lstm = nn.LSTM(resnet1_channel_size, embed_size, num_layers=num_layers, dropout=0.2, batch_first=False)
self.fc = nn.Linear(embed_size, n_classes)
self.out_dropout = nn.Dropout(0.5)
def forward(self, images, hx, cx):
"""Extract the image feature vectors."""
# (B x T x C x H x W) -> (B*T x C x H x W)
# images = squash_dims(images, (0, 1))
features = self.resnet1(images)
# features = unsquash_dim(features, 0, (-1, self.sequence_size))
features = unsquash_dim(features, 0, (-1, 1))
v = squash_dims(features[0].transpose(1, 0), (2, 3))
feature, attention = self.attn(hx[0], v, v)
feature = feature.permute((1, 0))
ys, (hx, cx) = self.lstm(feature.unsqueeze(0), (hx, cx))
ys = self.fc(ys)
ys = ys.mean(1)
return ys, hx, cx
def trainable_parameters(self):
return get_fine_tuning_parameters(self)
import math
import torch.nn as nn
class MobileNet(nn.Module):
@staticmethod
def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv3d(
inp,
oup,
kernel_size=3,
stride=stride,
padding=1,
bias=False),
nn.BatchNorm3d(oup),
nn.ReLU(inplace=True)
)
@staticmethod
def conv_dw(inp, oup, stride):
return nn.Sequential(
nn.Conv3d(
inp,
inp,
kernel_size=3,
stride=stride,
padding=1,
groups=inp,
bias=False),
nn.BatchNorm3d(inp),
nn.ReLU(inplace=True),
nn.Conv3d(
inp,
oup,
kernel_size=1,
stride=1,
padding=0,
bias=False),
nn.BatchNorm3d(oup),
nn.ReLU(inplace=True)
)
def __init__(self, sample_size, sample_duration, num_classes=400, last_fc=True):
super(MobileNet, self).__init__()
self.last_fc = last_fc
self.model = nn.Sequential(
self.conv_bn(3, 32, 2),
self.conv_dw(32, 64, 1),
self.conv_dw(64, 128, 2),
self.conv_dw(128, 128, 1),
self.conv_dw(128, 256, 2),
self.conv_dw(256, 256, 1),
self.conv_dw(256, 512, 2),
self.conv_dw(512, 512, 1),
self.conv_dw(512, 512, 1),
self.conv_dw(512, 512, 1),
self.conv_dw(512, 512, 1),
self.conv_dw(512, 512, 1),
self.conv_dw(512, 1024, 2),
self.conv_dw(1024, 1024, 1),
)
last_duration = math.ceil(sample_duration / 16)
last_size = math.ceil(sample_size / 32)
self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
self.fc = nn.Linear(1024, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv3d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
x = self.model(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
if self.last_fc:
x = self.fc(x)
return x
class DepthWiseBlock(nn.Module):
def __init__(self, inp, oup, stride=1):
super(DepthWiseBlock, self).__init__()
self.conv1 = nn.Conv3d(
inp,
inp,
kernel_size=3,
stride=stride,
padding=1,
groups=inp,
bias=False)
self.bn1 = nn.BatchNorm3d(inp)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv3d(
inp,
oup,
kernel_size=1,
stride=1,
padding=0,
bias=False)
self.bn2 = nn.BatchNorm3d(oup)
self.inplanes = inp
self.outplanes = oup
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.stride == 1 and self.inplanes == self.outplanes:
out += residual
out = self.relu(out)
return out
class MobileNetResidual(nn.Module):
@staticmethod
def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv3d(
inp,
oup,
kernel_size=3,
stride=stride,
padding=1,
bias=False),
nn.BatchNorm3d(oup),
nn.ReLU(inplace=True)
)
def __init__(self, sample_size, sample_duration, num_classes=400, last_fc=True):
super(MobileNetResidual, self).__init__()
self.last_fc = last_fc
self.model = nn.Sequential(
self.conv_bn(3, 32, 2),
DepthWiseBlock(32, 64, 1),
DepthWiseBlock(64, 128, (1, 2, 2)),
DepthWiseBlock(128, 128, 1),
DepthWiseBlock(128, 256, 2),
DepthWiseBlock(256, 256, 1),
DepthWiseBlock(256, 512, 2),
DepthWiseBlock(512, 512, 1),
DepthWiseBlock(512, 512, 1),
DepthWiseBlock(512, 512, 1),
DepthWiseBlock(512, 512, 1),
DepthWiseBlock(512, 512, 1),
DepthWiseBlock(512, 1024, 2),
DepthWiseBlock(1024, 1024, 1),
)
last_duration = math.ceil(sample_duration / 16)
last_size = math.ceil(sample_size / 32)
self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
self.dropout = nn.Dropout(p=0.5)
self.fc = nn.Linear(1024, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv3d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
x = self.model(x)
x = self.avgpool(x)
x = self.dropout(x)
x = x.view(x.size(0), -1)
if self.last_fc:
x = self.fc(x)
return x
from .functional import unsquash_dim, squash_dims, reduce_tensor
from .modules import Identity, AttentionLSTM, Attention, SEBlock, StateInitFC, StateInitZero
from . import self_attention, bnlstm, tcn
__all__ = ['unsquash_dim', 'squash_dims', 'reduce_tensor', 'self_attention', 'bnlstm', 'tcn', 'Identity',
'AttentionLSTM', 'Attention', 'SEBlock', 'StateInitFC', 'StateInitZero']
"""Implementation of batch-normalized LSTM."""
import torch
from torch import nn
from torch.nn import functional, init
class SeparatedBatchNorm1d(nn.Module):
"""
A batch normalization module which keeps its running mean
and variance separately per timestep.
"""
def __init__(self, num_features, max_length, eps=1e-5, momentum=0.1,
affine=True):
"""
Most parts are copied from
torch.nn.modules.batchnorm._BatchNorm.
"""
super(SeparatedBatchNorm1d, self).__init__()
self.num_features = num_features
self.max_length = max_length
self.affine = affine
self.eps = eps
self.momentum = momentum
if self.affine:
self.weight = nn.Parameter(torch.FloatTensor(num_features))
self.bias = nn.Parameter(torch.FloatTensor(num_features))
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)
for i in range(max_length):
self.register_buffer(
'running_mean_{}'.format(i), torch.zeros(num_features))
self.register_buffer(
'running_var_{}'.format(i), torch.ones(num_features))
self.reset_parameters()
def reset_parameters(self):
for i in range(self.max_length):
running_mean_i = getattr(self, 'running_mean_{}'.format(i))
running_var_i = getattr(self, 'running_var_{}'.format(i))
running_mean_i.zero_()
running_var_i.fill_(1)
if self.affine:
self.weight.data.uniform_()
self.bias.data.zero_()
def _check_input_dim(self, input_):
if input_.size(1) != self.running_mean_0.nelement():
raise ValueError('got {}-feature tensor, expected {}'
.format(input_.size(1), self.num_features))
def forward(self, input_, time):
self._check_input_dim(input_)
if time >= self.max_length:
time = self.max_length - 1
running_mean = getattr(self, 'running_mean_{}'.format(time))
running_var = getattr(self, 'running_var_{}'.format(time))
return functional.batch_norm(
input=input_, running_mean=running_mean, running_var=running_var,
weight=self.weight, bias=self.bias, training=self.training,
momentum=self.momentum, eps=self.eps)
def __repr__(self):
return ('{name}({num_features}, eps={eps}, momentum={momentum},'
' max_length={max_length}, affine={affine})'
.format(name=self.__class__.__name__, **self.__dict__))
class BNLSTMCell(nn.Module):
"""A BN-LSTM cell."""
def __init__(self, input_size, hidden_size, max_length, use_bias=True):
super(BNLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.max_length = max_length
self.use_bias = use_bias
self.weight_ih = nn.Parameter(
torch.FloatTensor(input_size, 4 * hidden_size))
self.weight_hh = nn.Parameter(
torch.FloatTensor(hidden_size, 4 * hidden_size))
if use_bias:
self.bias = nn.Parameter(torch.FloatTensor(4 * hidden_size))
else:
self.register_parameter('bias', None)
# BN parameters
self.bn_ih = SeparatedBatchNorm1d(
num_features=4 * hidden_size, max_length=max_length)
self.bn_hh = SeparatedBatchNorm1d(
num_features=4 * hidden_size, max_length=max_length)
self.bn_c = SeparatedBatchNorm1d(
num_features=hidden_size, max_length=max_length)
self.reset_parameters()
def reset_parameters(self):
"""
Initialize parameters following the way proposed in the paper.
"""
# The input-to-hidden weight matrix is initialized orthogonally.
init.orthogonal_(self.weight_ih.data)
# The hidden-to-hidden weight matrix is initialized as an identity
# matrix.
weight_hh_data = torch.eye(self.hidden_size)
weight_hh_data = weight_hh_data.repeat(1, 4)
self.weight_hh.data.set_(weight_hh_data)
# The bias is just set to zero vectors.
init.constant_(self.bias.data, val=0)
# Initialization of BN parameters.
self.bn_ih.reset_parameters()
self.bn_hh.reset_parameters()
self.bn_c.reset_parameters()
self.bn_ih.bias.data.fill_(0)
self.bn_hh.bias.data.fill_(0)
self.bn_ih.weight.data.fill_(0.1)
self.bn_hh.weight.data.fill_(0.1)
self.bn_c.weight.data.fill_(0.1)
def forward(self, input_, hx, time):
"""
Args:
input_: A (batch, input_size) tensor containing input
features.
hx: A tuple (h_0, c_0), which contains the initial hidden
and cell state, where the size of both states is
(batch, hidden_size).
time: The current timestep value, which is used to
get appropriate running statistics.
Returns:
h_1, c_1: Tensors containing the next hidden and cell state.
"""
h_0, c_0 = hx
batch_size = h_0.size(0)
bias_batch = (self.bias.unsqueeze(0)
.expand(batch_size, *self.bias.size()))
wh = torch.mm(h_0, self.weight_hh)
wi = torch.mm(input_, self.weight_ih)
bn_wh = self.bn_hh(wh, time=time)
bn_wi = self.bn_ih(wi, time=time)
f, i, o, g = torch.split(bn_wh + bn_wi + bias_batch, self.hidden_size, dim=1)
c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g)
h_1 = torch.sigmoid(o) * torch.tanh(self.bn_c(c_1, time=time))
return h_1, c_1
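if __name__ == '__main__':
    # Minimal sketch (illustration only, not part of the original file): how the
    # per-timestep batch norm and the BN-LSTM cell defined above can be exercised.
    # All sizes below are arbitrary assumptions made just for this demo.
    bn = SeparatedBatchNorm1d(num_features=8, max_length=4)
    x = torch.randn(16, 8)
    print(bn(x, time=2).shape)  # normalized with running_mean_2 / running_var_2 -> (16, 8)
    cell = BNLSTMCell(input_size=8, hidden_size=16, max_length=4)
    step_input = torch.randn(5, 8)
    h0 = torch.zeros(5, 16)
    c0 = torch.zeros(5, 16)
    h1, c1 = cell(step_input, (h0, c0), time=0)
    print(h1.shape, c1.shape)  # (5, 16) (5, 16)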
import torch
def squash_dims(tensor, dims):
"""
Squashes the dimensions given in dims into a single dimension whose size equals the product of their sizes.
Args:
tensor (Tensor): input tensor
dims: dimensions over which tensor should be squashed
"""
assert len(dims) >= 2, "Expected two or more dims to be squashed"
size = tensor.size()
squashed_dim = size[dims[0]]
for i in range(1, len(dims)):
assert dims[i] == dims[i - 1] + 1, "Squashed dims should be consecutive"
squashed_dim *= size[dims[i]]
result_dims = size[:dims[0]] + (squashed_dim,) + size[dims[-1] + 1:]
return tensor.contiguous().view(*result_dims)
def unsquash_dim(tensor, dim, res_dim):
"""
Unsquashes the dimension given in dim into the separate dimensions given in res_dim.
Args:
tensor (Tensor): input tensor
dim (int): dimension that should be unsquashed
res_dim (tuple): sizes that the given dim should be unfolded into
"""
size = tensor.size()
result_dim = size[:dim] + res_dim + size[dim + 1:]
return tensor.view(*result_dim)
def reduce_tensor(tensor, dims, reduction=torch.sum):
"""Performs reduction over multiple dimensions at once"""
permute_idx = [i for i, d in enumerate(tensor.size()) if i not in dims]
result_dims = [d for i, d in enumerate(tensor.size()) if i not in dims]
tensor = tensor.permute(*(permute_idx + list(dims))).contiguous()
return reduction(tensor.view(*result_dims, -1), -1)
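if __name__ == '__main__':
    # Minimal sketch (illustration only): the helpers above merge consecutive
    # dimensions, split them back, and reduce over several dims at once.
    # The shapes below are arbitrary assumptions.
    x = torch.randn(2, 3, 4, 5)
    y = squash_dims(x, (0, 1))       # dims 0 and 1 merged -> (6, 4, 5)
    z = unsquash_dim(y, 0, (2, 3))   # dim 0 unfolded back -> (2, 3, 4, 5)
    s = reduce_tensor(x, (1, 2))     # sum over dims 1 and 2 -> (2, 5)
    print(y.shape, z.shape, s.shape)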
import torch
from torch import nn
from .functional import squash_dims
class SEBlock(nn.Module):
def __init__(self, channel, reduction=16):
super(SEBlock, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel),
nn.Sigmoid()
)
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
return x * y
class Identity(nn.Module):
def forward(self, input_):
return input_
class StateInitFC(nn.Module):
def __init__(self, init_size, hidden_size, activation=Identity):
super().__init__()
self.linear_h = nn.Linear(init_size, hidden_size)
self.linear_c = nn.Linear(init_size, hidden_size)
self.activation_h = activation()
self.activation_c = activation()
self.linear_h.weight.data.normal_(0.0, 0.02)
self.linear_h.bias.data.fill_(0)
self.linear_c.weight.data.normal_(0.0, 0.02)
self.linear_c.bias.data.fill_(0)
def forward(self, input_):
h0 = self.activation_h(self.linear_h(input_))
c0 = self.activation_c(self.linear_c(input_))
return h0, c0
class StateInitZero(nn.Module):
def __init__(self, hidden_size, num_layers=1, batch_first=False):
super(StateInitZero, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.batch_first = batch_first
def forward(self, input: torch.Tensor):
h0 = input.new_zeros((self.num_layers, input.size(0 if self.batch_first else 1), self.hidden_size))
c0 = input.new_zeros((self.num_layers, input.size(0 if self.batch_first else 1), self.hidden_size))
return h0, c0
class Attention(nn.Module):
def __init__(self, q_size, k_size, v_size):
super().__init__()
self.softmax = nn.Softmax(dim=1)
self.linear_q = nn.Linear(q_size, v_size)
self.linear_q.weight.data.normal_(0.0, 0.02)
self.linear_q.bias.data.fill_(0)
def forward(self, q, k, v):
attn_scores = self.linear_q(q)
attn_map = self.softmax(attn_scores.view(-1, attn_scores.size(-1)))
return (v * attn_map).sum(-1), attn_map
class AttentionLSTM(nn.Module):
"""LSTM with spatial attention """
def __init__(self, input_features, hidden_size, attention_size, batch_first=False, **kwargs):
super().__init__()
self.batch_first = batch_first
self.lstm = nn.LSTM(input_features, hidden_size, batch_first=False, **kwargs)
self.attention = Attention(hidden_size, None, attention_size)
def forward(self, x, hidden):
hx, cx = hidden
if self.batch_first:
x = x.transpose(0, 1)
outputs = []
for i in range(x.size(0)):
# transpose in order to correctly broadcast while multiplying v
# squash dims in order to pull into vector (C x N x L)
v = squash_dims(x[i].transpose(0, 1), (2, 3))
feature, attention = self.attention(hx[-1], v, v)
feature = feature.transpose(0, 1) # back to (N x C)
# unsqueeze to emulate sequence size = 1
_, (hx, cx) = self.lstm(feature.unsqueeze(0), (hx, cx))
outputs.append(hx)
ys = torch.cat(outputs, 0)
if self.batch_first:
ys = ys.transpose(0, 1)
return ys, (hx, cx)
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.nn import functional as F
from .modules import Identity
class Linear(nn.Module):
""" Simple Linear layer with xavier init """
def __init__(self, d_in, d_out, bias=True):
super(Linear, self).__init__()
self.linear = nn.Linear(d_in, d_out, bias=bias)
init.xavier_normal_(self.linear.weight)
def forward(self, x):
return self.linear(x)
class Bottle(nn.Module):
""" Perform the reshape routine before and after an operation """
def forward(self, input):
if len(input.size()) <= 2:
return super(Bottle, self).forward(input)
size = input.size()[:2]
out = super(Bottle, self).forward(input.view(size[0] * size[1], -1))
return out.view(size[0], size[1], -1)
class BottleLinear(Bottle, Linear):
""" Perform the reshape routine before and after a linear projection """
pass
class BottleSoftmax(Bottle, nn.Softmax):
""" Perform the reshape routine before and after a softmax operation"""
pass
class LayerNormalization(nn.Module):
""" Layer normalization module """
def __init__(self, d_hid, eps=1e-3):
super(LayerNormalization, self).__init__()
self.eps = eps
self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)
def forward(self, z):
if z.size(1) == 1:
return z
mu = torch.mean(z, keepdim=True, dim=-1)
sigma = torch.std(z, keepdim=True, dim=-1)
ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps)
ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out)
return ln_out
class ScaledDotProductAttention(nn.Module):
""" Scaled Dot-Product Attention """
def __init__(self, d_model, attn_dropout=0.1):
super(ScaledDotProductAttention, self).__init__()
self.temper = np.power(d_model, 0.5)
self.dropout = nn.Dropout(attn_dropout)
self.softmax = BottleSoftmax()
def forward(self, q, k, v):
# q.size(): [nh*b x t x d_k]
# flops: b*nh * 2 * t * d_k * d_k = 8.4m
attn = torch.bmm(q, k.transpose(1, 2)) / self.temper
# flops: < 20k
attn = self.softmax(attn)
attn = self.dropout(attn)
# flops: 2*b*nh*t*t*d_k = 1.05m
output = torch.bmm(attn, v)
# flops: ~10m
return output, attn
class MultiHeadAttention(nn.Module):
""" Multi-Head Attention module """
def __init__(self, n_head, input_size, output_size, d_k, d_v, dropout=0.1, use_proj=False, layer_norm=True):
"""
Args:
n_head: Number of attention heads
input_size: Input feature size
output_size: Output feature size
d_k: Feature size for each head
d_v: Feature size for each head
dropout: Dropout rate after projection
use_proj: add additional projection to output feature space
"""
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.use_proj = use_proj
self.w_qs = nn.Parameter(torch.FloatTensor(n_head, input_size, d_k))
self.w_ks = nn.Parameter(torch.FloatTensor(n_head, input_size, d_k))
self.w_vs = nn.Parameter(torch.FloatTensor(n_head, input_size, d_v))
self.attention = ScaledDotProductAttention(input_size)
self.layer_norm = LayerNormalization(input_size) if layer_norm else Identity()
if use_proj:
self.proj = Linear(n_head * d_v, output_size)
self.dropout = nn.Dropout(dropout)
init.xavier_normal_(self.w_qs)
init.xavier_normal_(self.w_ks)
init.xavier_normal_(self.w_vs)
def forward(self, q, k, v):
d_k, d_v = self.d_k, self.d_v
n_head = self.n_head
residual = q
mb_size, len_q, d_model = q.size()
mb_size, len_k, d_model = k.size()
mb_size, len_v, d_model = v.size()
# treat as a (n_head) size batch
q_s = q.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_q) x d_model
k_s = k.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_k) x d_model
v_s = v.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_v) x d_model
# treat the result as a (n_head * mb_size) size batch
# flops: 3 * 2 * t * nh * d_k * c = 12.6m
q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k) # (n_head*mb_size) x len_q x d_k
k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k) # (n_head*mb_size) x len_k x d_k
v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v) # (n_head*mb_size) x len_v x d_v
# perform attention, result size = (n_head * mb_size) x len_q x d_v
# flops: 0.63m
outputs, attns = self.attention(q_s, k_s, v_s)
# back to original mb_size batch, result size = mb_size x len_q x (n_head*d_v)
split_size = mb_size.item() if isinstance(mb_size, torch.Tensor) else mb_size
outputs = torch.cat(torch.split(outputs, split_size, dim=0), dim=-1)
if self.use_proj:
# project back to residual size
# flops: 2 * t * d_inner ^ 2 = 4.2m
outputs = self.proj(outputs)
outputs = self.dropout(outputs)
return self.layer_norm(outputs + residual), attns
class PositionwiseFeedForward(nn.Module):
""" A two-feed-forward-layer module """
def __init__(self, d_hid, d_inner_hid, dropout=0.1, layer_norm=True):
super(PositionwiseFeedForward, self).__init__()
self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1) # position-wise
self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1) # position-wise
self.layer_norm = LayerNormalization(d_hid) if layer_norm else Identity()
self.dropout = nn.Dropout(dropout)
self.relu = nn.ReLU()
def forward(self, x):
residual = x
output = self.relu(self.w_1(x.transpose(1, 2)))
output = self.w_2(output).transpose(2, 1)
output = self.dropout(output)
return self.layer_norm(output + residual)
class DecoderBlock(nn.Module):
""" Compose with two layers """
def __init__(self, input_size, hidden_size, inner_hidden_size, n_head, d_k, d_v, dropout=0.1, layer_norm=True):
super(DecoderBlock, self).__init__()
# flops: 17.5m
self.slf_attn = MultiHeadAttention(n_head, input_size, hidden_size, d_k, d_v, dropout=dropout,
layer_norm=layer_norm)
# flops: 0.528m
self.pos_ffn = PositionwiseFeedForward(hidden_size, inner_hidden_size, dropout=dropout, layer_norm=layer_norm)
def forward(self, enc_input):
enc_output, enc_slf_attn = self.slf_attn(
enc_input, enc_input, enc_input
)
enc_output = self.pos_ffn(enc_output)
return enc_output, enc_slf_attn
class PositionEncoding(nn.Module):
def __init__(self, n_positions, hidden_size):
super().__init__()
self.enc = nn.Embedding(n_positions, hidden_size, padding_idx=0)
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / hidden_size) for j in range(hidden_size)]
if pos != 0 else np.zeros(hidden_size) for pos in range(n_positions)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
self.enc.weight = torch.nn.Parameter(torch.from_numpy(position_enc).to(self.enc.weight.device, torch.float))
def forward(self, x):
indices = torch.arange(0, x.size(1)).to(self.enc.weight.device, torch.long)
encodings = self.enc(indices)
x += encodings
return x
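if __name__ == '__main__':
    # Minimal sketch (illustration only): self-attention over a short sequence
    # using the blocks defined above. The sizes and use_proj=True are assumptions
    # made just for this demo.
    x = torch.randn(2, 10, 64)  # (batch x time x features)
    mha = MultiHeadAttention(n_head=4, input_size=64, output_size=64,
                             d_k=16, d_v=16, use_proj=True)
    out, attn = mha(x, x, x)
    print(out.shape, attn.shape)  # (2, 10, 64) (8, 10, 10)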
# -*- coding: utf-8 -*-
# File : __init__.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
from .replicate import DataParallelWithCallback, patch_replication_callback
# -*- coding: utf-8 -*-
# File : comm.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import collections
import queue
import threading
__all__ = ['FutureResult', 'SlavePipe', 'SyncMaster']
class FutureResult(object):
"""A thread-safe future implementation. Used only as one-to-one pipe."""
def __init__(self):
self._result = None
self._lock = threading.Lock()
self._cond = threading.Condition(self._lock)
def put(self, result):
with self._lock:
assert self._result is None, 'Previous result hasn\'t been fetched.'
self._result = result
self._cond.notify()
def get(self):
with self._lock:
if self._result is None:
self._cond.wait()
res = self._result
self._result = None
return res
_MasterRegistry = collections.namedtuple('MasterRegistry', ['result'])
_SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result'])
class SlavePipe(_SlavePipeBase):
"""Pipe for master-slave communication."""
def run_slave(self, msg):
self.queue.put((self.identifier, msg))
ret = self.result.get()
self.queue.put(True)
return ret
class SyncMaster(object):
"""An abstract `SyncMaster` object.
- During replication, since data parallel triggers a callback on each module, all slave devices should
call `register(id)` and obtain a `SlavePipe` to communicate with the master.
- During the forward pass, the master device invokes `run_master`; all messages from the slave devices are collected
and passed to the registered callback.
- After receiving the messages, the master device gathers the information and determines the message to be passed
back to each slave device.
"""
def __init__(self, master_callback):
"""
Args:
master_callback: a callback to be invoked after having collected messages from slave devices.
"""
self._master_callback = master_callback
self._queue = queue.Queue()
self._registry = collections.OrderedDict()
self._activated = False
def __getstate__(self):
return {'master_callback': self._master_callback}
def __setstate__(self, state):
self.__init__(state['master_callback'])
def register_slave(self, identifier):
"""
Register a slave device.
Args:
identifier: an identifier, usually the device id.
Returns: a `SlavePipe` object which can be used to communicate with the master device.
"""
if self._activated:
assert self._queue.empty(), 'Queue is not clean before next initialization.'
self._activated = False
self._registry.clear()
future = FutureResult()
self._registry[identifier] = _MasterRegistry(future)
return SlavePipe(identifier, self._queue, future)
def run_master(self, master_msg):
"""
Main entry for the master device in each forward pass.
Messages are first collected from each device (including the master device), and then
a callback is invoked to compute the message to be sent back to each device
(including the master device).
Args:
master_msg: the message that the master wants to send to itself. This will be placed as the first
message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example.
Returns: the message to be sent back to the master device.
"""
self._activated = True
intermediates = [(0, master_msg)]
for i in range(self.nr_slaves):
intermediates.append(self._queue.get())
results = self._master_callback(intermediates)
assert results[0][0] == 0, 'The first result should belong to the master.'
for i, res in results:
if i == 0:
continue
self._registry[i].result.put(res)
for i in range(self.nr_slaves):
assert self._queue.get() is True
return results[0][1]
@property
def nr_slaves(self):
return len(self._registry)
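if __name__ == '__main__':
    # Minimal sketch (illustration only): one master thread and one slave thread
    # exchanging messages through SyncMaster. The callback below is a toy
    # assumption that simply echoes every collected message back to its sender.
    def echo_callback(intermediates):
        # intermediates is a list of (identifier, message) pairs
        return [(identifier, msg) for identifier, msg in intermediates]
    master = SyncMaster(echo_callback)
    pipe = master.register_slave(identifier=1)
    def slave_worker():
        print('slave received:', pipe.run_slave('hello from slave'))
    worker = threading.Thread(target=slave_worker)
    worker.start()
    print('master received:', master.run_master('hello from master'))
    worker.join()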
# -*- coding: utf-8 -*-
# File : replicate.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import functools
from torch.nn.parallel.data_parallel import DataParallel
__all__ = [
'CallbackContext',
'execute_replication_callbacks',
'DataParallelWithCallback',
'patch_replication_callback'
]
class CallbackContext(object):
pass
def execute_replication_callbacks(modules):
"""
Execute a replication callback `__data_parallel_replicate__` on each module created by the original replication.
The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`
Note that, as all modules are isomorphic, we assign each sub-module a context
(shared among multiple copies of this module on different devices).
Through this context, different copies can share some information.
We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback
of any slave copies.
"""
master_copy = modules[0]
nr_modules = len(list(master_copy.modules()))
ctxs = [CallbackContext() for _ in range(nr_modules)]
for i, module in enumerate(modules):
for j, m in enumerate(module.modules()):
if hasattr(m, '__data_parallel_replicate__'):
m.__data_parallel_replicate__(ctxs[j], i)
class DataParallelWithCallback(DataParallel):
"""
Data Parallel with a replication callback.
A replication callback `__data_parallel_replicate__` of each module will be invoked after it is created by
the original `replicate` function.
The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`
Examples:
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
# sync_bn.__data_parallel_replicate__ will be invoked.
"""
def replicate(self, module, device_ids):
modules = super(DataParallelWithCallback, self).replicate(module, device_ids)
execute_replication_callbacks(modules)
return modules
def patch_replication_callback(data_parallel):
"""
Monkey-patch an existing `DataParallel` object. Add the replication callback.
Useful when you have a customized `DataParallel` implementation.
Examples:
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallel(sync_bn, device_ids=[0, 1])
> patch_replication_callback(sync_bn)
# this is equivalent to
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
"""
assert isinstance(data_parallel, DataParallel)
old_replicate = data_parallel.replicate
@functools.wraps(old_replicate)
def new_replicate(module, device_ids):
modules = old_replicate(module, device_ids)
execute_replication_callbacks(modules)
return modules
data_parallel.replicate = new_replicate
# -*- coding: utf-8 -*-
# File : unittest.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import unittest
import numpy as np
from torch.autograd import Variable
def as_numpy(v):
if isinstance(v, Variable):
v = v.data
return v.cpu().numpy()
class TorchTestCase(unittest.TestCase):
def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3):
npa, npb = as_numpy(a), as_numpy(b)
self.assertTrue(
np.allclose(npa, npb, atol=atol),
'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
)
from torch import nn
from torch.nn.utils import weight_norm
class Chomp1d(nn.Module):
def __init__(self, chomp_size):
super(Chomp1d, self).__init__()
self.chomp_size = chomp_size
def forward(self, x):
return x[:, :, :-self.chomp_size].contiguous()
class TemporalBlock(nn.Module):
def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
super(TemporalBlock, self).__init__()
self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
stride=stride, padding=padding, dilation=dilation))
self.chomp1 = Chomp1d(padding)
self.relu1 = nn.ReLU()
self.dropout1 = nn.Dropout(dropout)
self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
stride=stride, padding=padding, dilation=dilation))
self.chomp2 = Chomp1d(padding)
self.relu2 = nn.ReLU()
self.dropout2 = nn.Dropout(dropout)
self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
self.conv2, self.chomp2, self.relu2, self.dropout2)
self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
self.relu = nn.ReLU()
self.init_weights()
def init_weights(self):
self.conv1.weight.data.normal_(0, 0.01)
self.conv2.weight.data.normal_(0, 0.01)
if self.downsample is not None:
self.downsample.weight.data.normal_(0, 0.01)
def forward(self, x):
out = self.net(x)
res = x if self.downsample is None else self.downsample(x)
return self.relu(out + res)
class TemporalConvNet(nn.Module):
def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
super(TemporalConvNet, self).__init__()
layers = []
num_levels = len(num_channels)
for i in range(num_levels):
dilation_size = 2 ** i
in_channels = num_inputs if i == 0 else num_channels[i - 1]
out_channels = num_channels[i]
layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
padding=(kernel_size - 1) * dilation_size, dropout=dropout)]
self.network = nn.Sequential(*layers)
def forward(self, x):
return self.network(x)
from torch import nn as nn
from torch.nn import functional as F
from ..utils import get_fine_tuning_parameters
from .backbone import make_encoder
from .modules import squash_dims, unsquash_dim
class MultiFrameBaseline(nn.Module):
"""Simple baseline that runs a classifier on each frame independently and averages logits."""
def __init__(self, sample_duration, encoder='resnet34', n_classes=400, input_size=224, pretrained=True,
input_channels=3):
"""Average prediction over multiple frames"""
super().__init__()
# backbone
encoder = make_encoder(encoder, input_size=input_size, input_channels=input_channels, pretrained=pretrained)
self.resnet = encoder.features # name is kept for compatibility with older checkpoints
self.last_feature_size = encoder.features_shape[1]
self.fc = nn.Linear(encoder.features_shape[0], n_classes)
self.dropout = nn.Dropout2d(0.5)
self.sequence_size = sample_duration
self.init_weights()
def init_weights(self):
"""Initialize the weights."""
self.fc.weight.data.normal_(0.0, 0.02)
self.fc.bias.data.fill_(0)
def forward(self, images):
"""Extract the image feature vectors."""
# (B x T x C x H x W) -> (B*T x C x H x W)
images = squash_dims(images, (0, 1))
features = self.resnet(images)
# features = self.dropout(features)
features = F.avg_pool2d(features, self.last_feature_size) # (B*T) x C
features = unsquash_dim(features, 0, (-1, self.sequence_size))
ys = self.fc(features.squeeze(-1).squeeze(-1))
return ys.mean(1)
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
import math
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from ..utils import get_fine_tuning_parameters
__all__ = [
'PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnet200'
]
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class PreActivationBasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(PreActivationBasicBlock, self).__init__()
self.bn1 = nn.BatchNorm3d(inplanes)
self.conv1 = conv3x3x3(inplanes, planes, stride)
self.bn2 = nn.BatchNorm3d(planes)
self.conv2 = conv3x3x3(planes, planes)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.bn1(x)
out = self.relu(out)
out = self.conv1(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
return out
class PreActivationBottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(PreActivationBottleneck, self).__init__()
self.bn1 = nn.BatchNorm3d(inplanes)
self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
self.bn2 = nn.BatchNorm3d(planes)
self.conv2 = nn.Conv3d(
planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn3 = nn.BatchNorm3d(planes)
self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.bn1(x)
out = self.relu(out)
out = self.conv1(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn3(out)
out = self.relu(out)
out = self.conv3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
return out
class PreActivationResNet(nn.Module):
def __init__(self,
block,
layers,
sample_size,
sample_duration,
shortcut_type='B',
num_classes=400):
self.inplanes = 64
super(PreActivationResNet, self).__init__()
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=(7, 7, 7),
stride=(2, 2, 2),
padding=(1, 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
self.layer2 = self._make_layer(
block, 128, layers[1], shortcut_type, stride=2)
self.layer3 = self._make_layer(
block, 256, layers[2], shortcut_type, stride=2)
self.layer4 = self._make_layer(
block, 512, layers[3], shortcut_type, stride=2)
last_duration = int(math.ceil(sample_duration / 16))
last_size = int(math.ceil(sample_size / 32))
self.avgpool = nn.AdaptiveAvgPool3d(1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv3d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if shortcut_type == 'A':
downsample = partial(
downsample_basic_block,
planes=planes * block.expansion,
stride=stride)
else:
downsample = nn.Sequential(
nn.Conv3d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm3d(planes * block.expansion))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
def resnet18(**kwargs):
"""Constructs a ResNet-18 model.
"""
model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs)
return model
def resnet34(**kwargs):
"""Constructs a ResNet-34 model.
"""
model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs)
return model
def resnet50(**kwargs):
"""Constructs a ResNet-50 model.
"""
model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3],
**kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-101 model.
"""
model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3],
**kwargs)
return model
def resnet200(**kwargs):
"""Constructs a ResNet-101 model.
"""
model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3],
**kwargs)
return model
import math
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
__all__ = ['ResNeXt', 'resnet50', 'resnet101']
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class ResNeXtBottleneck(nn.Module):
expansion = 2
def __init__(self, inplanes, planes, cardinality, stride=1,
downsample=None):
super(ResNeXtBottleneck, self).__init__()
mid_planes = cardinality * int(planes / 32)
self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm3d(mid_planes)
self.conv2 = nn.Conv3d(
mid_planes,
mid_planes,
kernel_size=3,
stride=stride,
padding=1,
groups=cardinality,
bias=False)
self.bn2 = nn.BatchNorm3d(mid_planes)
self.conv3 = nn.Conv3d(
mid_planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm3d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNeXt(nn.Module):
def __init__(self,
block,
layers,
sample_size,
sample_duration,
shortcut_type='B',
cardinality=32,
num_classes=400):
self.inplanes = 64
super(ResNeXt, self).__init__()
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=7,
stride=(1, 2, 2),
padding=(3, 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type,
cardinality)
self.layer2 = self._make_layer(
block, 256, layers[1], shortcut_type, cardinality, stride=2)
self.layer3 = self._make_layer(
block, 512, layers[2], shortcut_type, cardinality, stride=2)
self.layer4 = self._make_layer(
block, 1024, layers[3], shortcut_type, cardinality, stride=2)
last_duration = int(math.ceil(sample_duration / 16))
last_size = int(math.ceil(sample_size / 32))
self.avgpool = nn.AvgPool3d(
(last_duration, last_size, last_size), stride=1)
self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv3d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self,
block,
planes,
blocks,
shortcut_type,
cardinality,
stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if shortcut_type == 'A':
downsample = partial(
downsample_basic_block,
planes=planes * block.expansion,
stride=stride)
else:
downsample = nn.Sequential(
nn.Conv3d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm3d(planes * block.expansion))
layers = []
layers.append(
block(self.inplanes, planes, cardinality, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, cardinality))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet50(**kwargs):
"""Constructs a ResNet-50 model.
"""
model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-101 model.
"""
model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs)
return model
import torch
from torch import nn as nn
from torch.nn import functional as F
from ..utils import get_fine_tuning_parameters, load_state
from .backbone import make_encoder
from .modules import Identity, squash_dims, unsquash_dim
from .modules.self_attention import DecoderBlock, PositionEncoding
class VideoTransformer(nn.Module):
def __init__(self, embed_size, sequence_size, encoder='resnet34', n_classes=400, input_size=224, pretrained=True,
input_channels=3, num_layers=4, layer_norm=True):
super().__init__()
# backbone
encoder = make_encoder(encoder, input_size=input_size, pretrained=pretrained, input_channels=input_channels)
self.resnet = encoder.features # name is kept for compatibility with older checkpoints
self.last_feature_size = encoder.features_shape[1]
if encoder.features_shape[0] != embed_size:
self.reduce_conv = nn.Conv2d(encoder.features_shape[0], embed_size, 1)
else:
self.reduce_conv = Identity()
self.sequence_size = sequence_size
self.self_attention_decoder = SelfAttentionDecoder(embed_size, embed_size, [8] * num_layers,
sequence_size, layer_norm=layer_norm)
self.fc = nn.Linear(embed_size, n_classes)
self.dropout = nn.Dropout2d(0.8)
self.init_weights()
self.input_channels = input_channels
def init_weights(self):
"""Initialize the weights."""
self.fc.weight.data.normal_(0.0, 0.02)
self.fc.bias.data.fill_(0)
def forward(self, rgb_clip):
"""Extract the image feature vectors."""
# (B x T x C x H x W) -> (B*T x C x H x W)
rgb_clip = squash_dims(rgb_clip, (0, 1))
features = self.resnet(rgb_clip)
features = self.reduce_conv(features)
features = F.avg_pool2d(features, 7) # (B*T) x C
features = unsquash_dim(features, 0, (-1, self.sequence_size))
ys = self.self_attention_decoder(features[..., 0, 0])
# ys = self.dropout(ys)
ys = self.fc(ys)
return ys.mean(1)
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
def load_checkpoint(self, state_dict):
load_state(self, state_dict, 'fc')
class SelfAttentionDecoder(nn.Module):
def __init__(self, input_size, hidden_size, n_heads, sequence_size, inner_hidden_factor=2, layer_norm=True):
super().__init__()
input_sizes = [hidden_size] * len(n_heads)
input_sizes[0] = input_size
hidden_sizes = [hidden_size] * len(n_heads)
self.position_encoding = PositionEncoding(sequence_size, hidden_size)
self.layers = nn.ModuleList([
DecoderBlock(inp_size, hid_size, hid_size * inner_hidden_factor, n_head, hid_size // n_head,
hid_size // n_head, layer_norm=layer_norm)
for i, (inp_size, hid_size, n_head) in enumerate(zip(input_sizes, hidden_sizes, n_heads))
])
def forward(self, x):
outputs, attentions = [], []
b, t, c = x.size()
x = self.position_encoding(x)
for layer in self.layers:
x, attn = layer(x)
outputs.append(x)
return x
import torch
from torch import nn
from ..utils import get_fine_tuning_parameters, load_state
from .video_transformer import VideoTransformer
class RGBDiff(nn.Module):
def __init__(self, dim=1):
super().__init__()
self.dim = dim
def forward(self, image):
"""
Args:
image (torch.Tensor): (N x T x C x H x W)
"""
diffs = []
for i in range(1, image.size(self.dim)):
prev = image.index_select(self.dim, image.new_tensor(i - 1, dtype=torch.long))
current = image.index_select(self.dim, image.new_tensor(i, dtype=torch.long))
diffs.append(current - prev)
return torch.cat(diffs, dim=self.dim)
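if __name__ == '__main__':
    # Minimal sketch (illustration only): RGBDiff turns a clip of T frames into
    # T-1 consecutive frame differences along the time dimension. The clip shape
    # below is an arbitrary assumption.
    clip = torch.randn(2, 16, 3, 224, 224)  # (N x T x C x H x W)
    print(RGBDiff()(clip).shape)            # expected: (2, 15, 3, 224, 224)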
class VideoTransformerMotion(nn.Module):
def __init__(self, embed_size, sequence_size, encoder_name, n_classes=400, input_size=224, pretrained=True,
mode='rgbdiff', layer_norm=True):
"""Load the pretrained ResNet and replace top fc layer."""
super().__init__()
self.mode = mode
motion_sequence_size = sequence_size
input_channels = 3
if self.mode == "flow":
input_channels = 2
elif self.mode == "rgbdiff":
motion_sequence_size = motion_sequence_size - 1
self.rgb_diff = RGBDiff()
else:
raise Exception("Unsupported mode " + self.mode)
self.motion_decoder = VideoTransformer(embed_size, motion_sequence_size, encoder_name, n_classes=n_classes,
input_size=input_size, pretrained=pretrained,
input_channels=input_channels, layer_norm=layer_norm)
def forward(self, clip):
"""Extract the image feature vectors."""
if self.mode == "rgbdiff":
clip = self.rgb_diff(clip)
logits_motion = self.motion_decoder(clip)
return logits_motion
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
def load_checkpoint(self, state_dict):
load_state(self, state_dict, 'motion_decoder.fc')
import torch
from torch import nn
from action_recognition.models.vtn_motion import VideoTransformerMotion
from ..utils import get_fine_tuning_parameters, load_state
from .video_transformer import VideoTransformer
class VideoTransformerTwoStream(nn.Module):
def __init__(self, embed_size, sequence_size, encoder_name='resnet34', n_classes=400, input_size=224,
pretrained=True, motion_path=None, rgb_path=None, mode='rgbdiff', layer_norm=True):
"""Load the pretrained ResNet and replace top fc layer."""
super().__init__()
self.rgb_recoder = VideoTransformer(embed_size, sequence_size, encoder_name, n_classes=n_classes,
input_size=input_size, pretrained=pretrained, num_layers=4,
layer_norm=layer_norm)
self.motion_decoder = VideoTransformerMotion(embed_size, sequence_size, encoder_name, n_classes=n_classes,
input_size=input_size, pretrained=pretrained, mode=mode,
layer_norm=layer_norm)
if motion_path and rgb_path:
self.load_separate_trained(motion_path, rgb_path)
def load_separate_trained(self, motion_path, rgb_path):
print("Loading rgb model from: {}".format(rgb_path))
rgb_checkpoint = torch.load(rgb_path.as_posix())
self.rgb_recoder.load_checkpoint(rgb_checkpoint['state_dict'])
print("Loading motion model from: {}".format(motion_path))
motion_checkpoint = torch.load(motion_path.as_posix())
self.motion_decoder.load_checkpoint(motion_checkpoint['state_dict'])
def forward(self, rgb_clip=None, flow_clip=None):
"""Extract the image feature vectors."""
logits_rgb = self.rgb_recoder(rgb_clip)
motion_input = rgb_clip
if flow_clip is not None:
motion_input = flow_clip
logits_motion = self.motion_decoder(motion_input)
return 0.5 * logits_rgb + 0.5 * logits_motion
def trainable_parameters(self):
param_groups = [
('trainable', {'re': r''}),
]
return get_fine_tuning_parameters(self, param_groups)
import math
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
__all__ = ['WideResNet', 'resnet50']
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class WideBottleneck(nn.Module):
expansion = 2
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(WideBottleneck, self).__init__()
self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm3d(planes)
self.conv2 = nn.Conv3d(
planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm3d(planes)
self.conv3 = nn.Conv3d(
planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm3d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class WideResNet(nn.Module):
def __init__(self,
block,
layers,
sample_size,
sample_duration,
k=1,
shortcut_type='B',
num_classes=400):
self.inplanes = 64
super(WideResNet, self).__init__()
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=7,
stride=(1, 2, 2),
padding=(3, 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type)
self.layer2 = self._make_layer(
block, 128 * k, layers[1], shortcut_type, stride=2)
self.layer3 = self._make_layer(
block, 256 * k, layers[2], shortcut_type, stride=2)
self.layer4 = self._make_layer(
block, 512 * k, layers[3], shortcut_type, stride=2)
last_duration = int(math.ceil(sample_duration / 16))
last_size = int(math.ceil(sample_size / 32))
self.avgpool = nn.AvgPool3d(
(last_duration, last_size, last_size), stride=1)
self.fc = nn.Linear(512 * k * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv3d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if shortcut_type == 'A':
downsample = partial(
downsample_basic_block,
planes=planes * block.expansion,
stride=stride)
else:
downsample = nn.Sequential(
nn.Conv3d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm3d(planes * block.expansion))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet50(**kwargs):
"""Constructs a ResNet-50 model.
"""
model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs)
return model
class Compose(object):
"""Compose multiple target transforms"""
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, target):
dst = []
for t in self.transforms:
dst.append(t(target))
return dst
class ClassLabel(object):
"""Returns video label and name. Used for training and validation."""
def __call__(self, target):
return {
'label': target['label'],
'video': target['video']
}
class VideoID(object):
"""Returns video name. Used for video prediction."""
def __call__(self, target):
return {
'label': target['label'],
'video_id': target['video_id']
}
import random
class LoopPadding(object):
"""Extend short clip to a given size"""
def __init__(self, size):
self.size = size
def __call__(self, frame_indices):
out = frame_indices
for index in out:
if len(out) >= self.size:
break
out.append(index)
return out
class TemporalStride:
"""Skips frames with a given step. Increases effective temporal receptive field."""
def __init__(self, stride=1):
self.stride = stride
def __call__(self, frame_indices):
return frame_indices[::self.stride]
class TemporalBeginCrop(object):
"""Temporally crop the given frame indices at a beginning.
If the number of frames is less than the size,
loop the indices as many times as necessary to satisfy the size.
Args:
size (int): Desired output size of the crop.
"""
def __init__(self, size):
self.size = size
def __call__(self, frame_indices):
out = frame_indices[:self.size]
for index in out:
if len(out) >= self.size:
break
out.append(index)
return out
class TemporalCenterCrop(object):
"""Temporally crop the given frame indices at a center.
If the number of frames is less than the size,
loop the indices as many times as necessary to satisfy the size.
Args:
size (int): Desired output size of the crop.
"""
def __init__(self, size):
self.size = size
def __call__(self, frame_indices):
"""
Args:
frame_indices (list): frame indices to be cropped.
Returns:
list: Cropped frame indices.
"""
center_index = len(frame_indices) // 2
begin_index = max(0, center_index - (self.size // 2))
end_index = min(begin_index + self.size, len(frame_indices))
out = frame_indices[begin_index:end_index]
for index in out:
if len(out) >= self.size:
break
out.append(index)
return out
class TemporalRandomCrop(object):
"""Temporally crop the given frame indices at a random location.
If the number of frames is less than the size,
loop the indices as many times as necessary to satisfy the size.
Args:
size (int): Desired output size of the crop.
"""
def __init__(self, size):
self.size = size
def __call__(self, frame_indices):
"""
Args:
frame_indices (list): frame indices to be cropped.
Returns:
list: Cropped frame indices.
"""
rand_end = max(0, len(frame_indices) - self.size - 1)
begin_index = random.randint(0, rand_end)
end_index = min(begin_index + self.size, len(frame_indices))
out = frame_indices[int(begin_index):int(end_index)]
for index in out:
if len(out) >= self.size:
break
out.append(index)
return out
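if __name__ == '__main__':
    # Minimal sketch (illustration only): center-crop 8 indices out of 20 and
    # loop-pad a 3-frame clip up to 8 indices using the transforms defined above.
    print(TemporalCenterCrop(8)(list(range(20))))  # [6, 7, 8, 9, 10, 11, 12, 13]
    print(LoopPadding(8)(list(range(3))))          # [0, 1, 2, 0, 1, 2, 0, 1]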
import torch
import torch.nn.functional as F
from .utils import AverageMeter, prepare_batch
from .utils import calculate_accuracy
def test(args, data_loader, model, logger):
print('test')
model.eval()
video_acc = AverageMeter()
output_buffer = []
previous_video_id = None
for i, (inputs, targets) in logger.scope_enumerate(data_loader):
video_ids = targets['video']
batch_size, inputs, labels = prepare_batch(args, inputs, targets)
outputs = model(*inputs)
if args.softmax_in_test:
outputs = F.softmax(outputs, dim=1)
for j in range(outputs.size(0)):
if video_ids[j] != previous_video_id and not (i == 0 and j == 0):
# Computed all segments for current video
video_outputs = torch.stack(output_buffer)
video_result = torch.mean(video_outputs, dim=0)
probs, preds = torch.topk(video_result, k=1)
is_correct_match = (video_gt.cpu() == preds).item()
video_acc.update(is_correct_match)
output_buffer.clear()
output_buffer.append(outputs[j].data.cpu())
previous_video_id = video_ids[j]
video_gt = labels[j]
clip_acc = calculate_accuracy(outputs, labels)
logger.log_value("test/acc", clip_acc, batch_size)
logger.log_value("test/video", video_acc.avg)
return logger.get_value("test/video"), logger.get_value("test/acc")
import torch
from torch.optim import lr_scheduler
from .utils import (calculate_accuracy, prepare_batch,
save_checkpoint)
from .validation import validate
def train_epoch(args, epoch, data_loader, model, criterion, optimizer, logger):
print('train at epoch {}'.format(epoch))
model.train()
for i, (inputs_dict, targets) in logger.scope_enumerate(data_loader, epoch, total_time='time/train_epoch',
fetch_time='time/train_data', body_time='time/train_step'):
batch_size, inputs, labels = prepare_batch(args, inputs_dict, targets)
outputs = model(*inputs)
loss = criterion(outputs=outputs, inputs=inputs, targets=labels)
acc = calculate_accuracy(outputs, labels)
if i % args.iter_size == 0:
optimizer.zero_grad()
loss.backward()
if args.gradient_clipping:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clipping)
if args.iter_size > 1 and (i + 1) % args.iter_size == 0:
for p in model.parameters():
p.grad.data.mul_(1 / args.iter_size)
optimizer.step()
logger.log_value('train/loss', loss.item(), batch_size)
logger.log_value('train/acc', acc, batch_size)
if 'kd' in criterion.values:
logger.log_value("train/kd_loss", criterion.values['kd'].item())
logger.log_value("train/epoch_loss", logger.get_value("train/loss"))
logger.log_value("train/epoch_acc", logger.get_value("train/acc"))
return logger.get_value("train/acc"), logger.get_value("train/loss")
def train(args, model, train_loader, val_loader, criterion, optimizer, scheduler, logger):
for epoch in range(args.begin_epoch, args.n_epochs + 1):
with logger.scope(epoch):
for i, group in enumerate(optimizer.param_groups):
group_name = group.get('group_name', i)
logger.log_value("lr/{}".format(group_name), group['lr'])
with logger.scope(epoch):
train_acc, loss = train_epoch(args, epoch, train_loader, model, criterion, optimizer, logger)
if epoch % args.checkpoint == 0:
checkpoint_name = 'save_{}.pth'.format(epoch)
save_checkpoint(checkpoint_name, model, optimizer, epoch, args)
with logger.scope(epoch):
val_acc = validate(args, epoch, val_loader, model, criterion, logger)
logger.log_value("val/generalization_error", val_acc - train_acc)
if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
scheduler.step(val_acc)
else:
scheduler.step()
logger.reset_values('train')
logger.reset_values('val')
import os
from edge_engine.edge_processor import ExecutePipeline
from edge_engine.edge_processor import Pubs
from scripts import ActionRecognition
from edge_engine.common.config import EDGE_CONFIG
import time
if __name__ == '__main__':
pubs = Pubs()
mod = ActionRecognition(config=EDGE_CONFIG,
model_config=EDGE_CONFIG["modelConfig"],
pubs=pubs,
device_id=EDGE_CONFIG['deviceId'])
# mod._predict()
ex = ExecutePipeline(mod)
ex.run_model()