Source code for pytorchvideo.data.domsev

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import math
from dataclasses import dataclass, fields as dataclass_fields
from typing import Any, Callable, Dict, List, Optional, Tuple

import torch
from pytorchvideo.data.dataset_manifest_utils import (
    EncodedVideoInfo,
    VideoClipInfo,
    VideoDataset,
    VideoDatasetType,
    VideoInfo,
)
from pytorchvideo.data.utils import DataclassFieldCaster, load_dataclass_dict_from_csv
from pytorchvideo.data.video import Video


USER_SCENE_MAP = {
    0: "none",
    1: "indoor",
    2: "nature",
    3: "crowded_environment",
    4: "urban",
}

USER_ACTIVITY_MAP = {
    0: "none",
    1: "walking",
    2: "running",
    3: "standing",
    4: "biking",
    5: "driving",
    6: "playing",
    7: "cooking",
    8: "eating",
    9: "observing",
    10: "in_conversation",
    11: "browsing",
    12: "shopping",
}

USER_ATTENTION_MAP = {
    0: "none",
    1: "paying_attention",
    2: "interacting",
}


[docs]@dataclass
class ActivityData(DataclassFieldCaster):
    """
    Class representing a contiguous activity video segment from the DoMSEV dataset.
    """

    video_id: str
    start_time: float  # Start time of the activity, in seconds
    stop_time: float  # Stop time of the activity, in seconds
    start_frame: int  # 0-indexed ID of the start frame (inclusive)
    stop_frame: int  # 0-index ID of the stop frame (inclusive)
    activity_id: int
    activity_name: str


# Utility functions
[docs]def seconds_to_frame_index(
    time_in_seconds: float, fps: int, zero_indexed: Optional[bool] = True
) -> int:
    """Converts a point in time (in seconds) within a video clip to its closest
    frame indexed (rounding down), based on a specified frame rate.

    Args:
        time_in_seconds (float): The point in time within the video.
        fps (int): The frame rate (frames per second) of the video.
        zero_indexed (Optional[bool]): Whether the returned frame should be
            zero-indexed (if True) or one-indexed (if False).

    Returns:
        (int) The index of the nearest frame (rounding down to the nearest integer).
    """
    frame_idx = math.floor(time_in_seconds * fps)
    if not zero_indexed:
        frame_idx += 1
    return frame_idx


[docs]def frame_index_to_seconds(
    frame_index: int, fps: int, zero_indexed: Optional[bool] = True
) -> float:
    """Converts a frame index within a video clip to the corresponding
    point in time (in seconds) within the video, based on a specified frame rate.

    Args:
        frame_index (int): The index of the frame within the video.
        fps (int): The frame rate (frames per second) of the video.
        zero_indexed (Optional[bool]): Whether the specified frame is zero-indexed
            (if True) or one-indexed (if False).

    Returns:
        (float) The point in time within the video.
    """
    if not zero_indexed:
        frame_index -= 1
    time_in_seconds = frame_index / fps
    return time_in_seconds


[docs]def get_overlap_for_time_range_pair(
    t1_start: float, t1_stop: float, t2_start: float, t2_stop: float
) -> Optional[Tuple[float, float]]:
    """Calculates the overlap between two time ranges, if one exists.

    Returns:
        (Optional[Tuple]) A tuple of <overlap_start_time, overlap_stop_time> if
        an overlap is found, or None otherwise.
    """
    # Check if there is an overlap
    if (t1_start <= t2_stop) and (t2_start <= t1_stop):
        # Calculate the overlap period
        overlap_start_time = max(t1_start, t2_start)
        overlap_stop_time = min(t1_stop, t2_stop)
        return (overlap_start_time, overlap_stop_time)
    else:
        return None


[docs]class DomsevDataset(torch.utils.data.Dataset):
    """
    Egocentric activity classification video dataset for DoMSEV stored as
    an encoded video (with frame-level labels).
    <https://www.verlab.dcc.ufmg.br/semantic-hyperlapse/cvpr2018-dataset/>

    This dataset handles the loading, decoding, and configurable clip
    sampling for the videos.
    """

    def __init__(
        self,
        video_data_manifest_file_path: str,
        video_info_file_path: str,
        activities_file_path: str,
        clip_sampler: Callable[
            [Dict[str, Video], Dict[str, List[ActivityData]]], List[VideoClipInfo]
        ],
        dataset_type: VideoDatasetType = VideoDatasetType.Frame,
        frames_per_second: int = 1,
        transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
        frame_filter: Optional[Callable[[List[int]], List[int]]] = None,
        multithreaded_io: bool = False,
    ) -> None:
        f"""
        Args:
            video_data_manifest_file_path (str):
                The path to a json file outlining the available video data for the
                associated videos.  File must be a csv (w/header) with columns:
                {[f.name for f in dataclass_fields(EncodedVideoInfo)]}

                To generate this file from a directory of video frames, see helper
                functions in Module: pytorchvideo.data.domsev.utils

            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) with columns:
                {[f.name for f in dataclass_fields(VideoInfo)]}

            activities_file_path (str):
                Path or URI to manifest with activity annotations for each video.
                File must be a csv (w/header) with columns:
                {[f.name for f in dataclass_fields(ActivityData)]}

            clip_sampler: Callable[
                [Dict[str, Video], Dict[str, List[ActivityData]]], List[VideoClipInfo]
            ],

            dataset_type (VideoDatasetType): The dataformat in which dataset
                video data is store (e.g. video frames, encoded video etc).

            frames_per_second (int): The FPS of the stored videos. (NOTE:
                this is variable and may be different than the original FPS
                reported on the DoMSEV dataset website -- it depends on the
                subsampling and frame extraction done internally at Facebook).

            transform (Optional[Callable[[Dict[str, Any]], Any]]):
                This callable is evaluated on the clip output before the clip is returned.
                It can be used for user-defined preprocessing and augmentations to the clips.

                    The clip input is a dictionary with the following format:
                        {{
                            'video': <video_tensor>,
                            'audio': <audio_tensor>,
                            'activities': <activities_tensor>,
                            'start_time': <float>,
                            'stop_time': <float>
                        }}

                If transform is None, the raw clip output in the above format is
                returned unmodified.

            frame_filter (Optional[Callable[[List[int]], List[int]]]):
                This callable is evaluated on the set of available frame inidices to be
                included in a sampled clip. This can be used to subselect frames within
                a clip to be loaded.

            multithreaded_io (bool):
                Boolean to control whether parllelizable io operations are performed across
                multiple threads.
        """
        assert video_info_file_path
        assert activities_file_path
        assert video_data_manifest_file_path

        # Populate video and metadata data providers
        self._videos: Dict[str, Video] = VideoDataset._load_videos(
            video_data_manifest_file_path,
            video_info_file_path,
            multithreaded_io,
            dataset_type,
        )

        self._activities: Dict[str, List[ActivityData]] = load_dataclass_dict_from_csv(
            activities_file_path, ActivityData, "video_id", list_per_key=True
        )

        # Sample datapoints
        self._clips: List[VideoClipInfo] = clip_sampler(self._videos, self._activities)

        self._frames_per_second = frames_per_second
        self._user_transform = transform
        self._transform = self._transform_clip
        self._frame_filter = frame_filter

[docs]    def __getitem__(self, index) -> Dict[str, Any]:
        """
        Samples a video clip associated to the given index.

        Args:
            index (int): index for the video clip.

        Returns:
            A video clip with the following format if transform is None:
                {{
                    'video_id': <str>,
                    'video': <video_tensor>,
                    'audio': <audio_tensor>,
                    'activities': <activities_tensor>,
                    'start_time': <float>,
                    'stop_time': <float>
                }}
            Otherwise, the transform defines the clip output.
        """
        clip = self._clips[index]

        # Filter activities by only the ones that appear within the clip boundaries,
        # and unpack the activities so there is one per frame in the clip
        activities_in_video = self._activities[clip.video_id]
        activities_in_clip = []
        for activity in activities_in_video:
            overlap_period = get_overlap_for_time_range_pair(
                clip.start_time, clip.stop_time, activity.start_time, activity.stop_time
            )
            if overlap_period is not None:
                overlap_start_time, overlap_stop_time = overlap_period

                # Convert the overlapping period between clip and activity to
                # 0-indexed start and stop frame indexes, so we can unpack 1
                # activity label per frame.
                overlap_start_frame = seconds_to_frame_index(
                    overlap_start_time, self._frames_per_second
                )
                overlap_stop_frame = seconds_to_frame_index(
                    overlap_stop_time, self._frames_per_second
                )

                # Append 1 activity label per frame
                for _ in range(overlap_start_frame, overlap_stop_frame):
                    activities_in_clip.append(activity)

        # Convert the list of ActivityData objects to a tensor of just the activity class IDs
        activity_class_ids = [
            activities_in_clip[i].activity_id for i in range(len(activities_in_clip))
        ]
        activity_class_ids_tensor = torch.tensor(activity_class_ids)

        clip_data = {
            "video_id": clip.video_id,
            **self._videos[clip.video_id].get_clip(clip.start_time, clip.stop_time),
            "activities": activity_class_ids_tensor,
            "start_time": clip.start_time,
            "stop_time": clip.stop_time,
        }

        if self._transform:
            clip_data = self._transform(clip_data)

        return clip_data

[docs]    def __len__(self) -> int:
        """
        Returns:
            The number of video clips in the dataset.
        """
        return len(self._clips)

    def _transform_clip(self, clip: Dict[str, Any]) -> Dict[str, Any]:
        """Transforms a given video clip, according to some pre-defined transforms
        and an optional user transform function (self._user_transform).

        Args:
            clip (Dict[str, Any]): The clip that will be transformed.

        Returns:
            (Dict[str, Any]) The transformed clip.
        """
        for key in clip:
            if clip[key] is None:
                clip[key] = torch.tensor([])

        if self._user_transform:
            clip = self._user_transform(clip)

        return clip