Shortcuts

Source code for pytorchvideo.data.domsev

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import math
from dataclasses import dataclass, fields as dataclass_fields
from typing import Any, Callable, Dict, List, Optional, Tuple

import torch
from pytorchvideo.data.dataset_manifest_utils import (
    EncodedVideoInfo,
    VideoClipInfo,
    VideoDataset,
    VideoDatasetType,
    VideoInfo,
)
from pytorchvideo.data.utils import DataclassFieldCaster, load_dataclass_dict_from_csv
from pytorchvideo.data.video import Video


# Lookup table from integer scene class ID to its human-readable label name.
# ID 0 is the "none" entry.
# NOTE(review): presumably mirrors the scene annotation classes of the DoMSEV
# dataset -- confirm against the dataset's label files.
USER_SCENE_MAP = {
    0: "none",
    1: "indoor",
    2: "nature",
    3: "crowded_environment",
    4: "urban",
}

# Lookup table from integer activity class ID to its human-readable label name.
# ID 0 is the "none" entry.
# NOTE(review): presumably these IDs/names correspond to the `activity_id` /
# `activity_name` fields of ActivityData -- confirm against the annotation csv.
USER_ACTIVITY_MAP = {
    0: "none",
    1: "walking",
    2: "running",
    3: "standing",
    4: "biking",
    5: "driving",
    6: "playing",
    7: "cooking",
    8: "eating",
    9: "observing",
    10: "in_conversation",
    11: "browsing",
    12: "shopping",
}

# Lookup table from integer attention class ID to its human-readable label name.
# ID 0 is the "none" entry.
# NOTE(review): presumably mirrors the attention annotation classes of the
# DoMSEV dataset -- confirm against the dataset's label files.
USER_ATTENTION_MAP = {
    0: "none",
    1: "paying_attention",
    2: "interacting",
}


@dataclass
class ActivityData(DataclassFieldCaster):
    """
    A single contiguous, activity-labelled segment of a DoMSEV video.

    Times are expressed in seconds; frame indices are 0-indexed and both the
    start and stop frame are inclusive.
    """

    # Identifier of the video this segment belongs to
    video_id: str
    # Time at which the activity begins, in seconds
    start_time: float
    # Time at which the activity ends, in seconds
    stop_time: float
    # 0-indexed ID of the first frame of the activity (inclusive)
    start_frame: int
    # 0-indexed ID of the last frame of the activity (inclusive)
    stop_frame: int
    # Integer class ID of the activity
    activity_id: int
    # Label name of the activity
    activity_name: str
# Utility functions
def seconds_to_frame_index(
    time_in_seconds: float, fps: int, zero_indexed: Optional[bool] = True
) -> int:
    """Map a timestamp (in seconds) to the index of the frame that contains it.

    The timestamp is multiplied by the frame rate and rounded down, so the
    result is the last frame that starts at or before ``time_in_seconds``.

    Args:
        time_in_seconds (float): The point in time within the video.
        fps (int): The frame rate (frames per second) of the video.
        zero_indexed (Optional[bool]): If truthy, the first frame has index 0;
            otherwise the first frame has index 1.

    Returns:
        (int) The frame index, rounded down to the nearest integer.
    """
    # One-indexed results are simply the zero-indexed result shifted by one.
    index_offset = 0 if zero_indexed else 1
    return math.floor(time_in_seconds * fps) + index_offset
def frame_index_to_seconds(
    frame_index: int, fps: int, zero_indexed: Optional[bool] = True
) -> float:
    """Map a frame index to the timestamp (in seconds) at which that frame occurs.

    Args:
        frame_index (int): The index of the frame within the video.
        fps (int): The frame rate (frames per second) of the video.
        zero_indexed (Optional[bool]): If truthy, ``frame_index`` counts from 0;
            otherwise it counts from 1.

    Returns:
        (float) The point in time within the video, in seconds.
    """
    # Normalize to a zero-based index before dividing by the frame rate.
    zero_based_index = frame_index if zero_indexed else frame_index - 1
    return zero_based_index / fps
def get_overlap_for_time_range_pair(
    t1_start: float, t1_stop: float, t2_start: float, t2_stop: float
) -> Optional[Tuple[float, float]]:
    """Compute the intersection of two closed time ranges.

    Ranges that merely touch at an endpoint are treated as overlapping, in
    which case the returned start and stop times are equal.

    Returns:
        (Optional[Tuple]) A tuple of <overlap_start_time, overlap_stop_time>
        if the ranges intersect, or None otherwise.
    """
    # Guard clause: bail out when the two ranges do not intersect.
    if not (t1_start <= t2_stop and t2_start <= t1_stop):
        return None

    # The intersection begins at the later start and ends at the earlier stop.
    return (max(t1_start, t2_start), min(t1_stop, t2_stop))
class DomsevDataset(torch.utils.data.Dataset):
    """
    Egocentric activity classification video dataset for DoMSEV stored as an
    encoded video (with frame-level labels).
    <https://www.verlab.dcc.ufmg.br/semantic-hyperlapse/cvpr2018-dataset/>

    This dataset handles the loading, decoding, and configurable clip sampling
    for the videos.
    """

    def __init__(
        self,
        video_data_manifest_file_path: str,
        video_info_file_path: str,
        activities_file_path: str,
        clip_sampler: Callable[
            [Dict[str, Video], Dict[str, List[ActivityData]]], List[VideoClipInfo]
        ],
        dataset_type: VideoDatasetType = VideoDatasetType.Frame,
        frames_per_second: int = 1,
        transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
        frame_filter: Optional[Callable[[List[int]], List[int]]] = None,
        multithreaded_io: bool = False,
    ) -> None:
        """
        Args:
            video_data_manifest_file_path (str):
                The path to a manifest file outlining the available video data
                for the associated videos. File must be a csv (w/header) whose
                columns are the fields of the ``EncodedVideoInfo`` dataclass.

                To generate this file from a directory of video frames, see
                helper functions in Module: pytorchvideo.data.domsev.utils

            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) whose columns are the fields of
                the ``VideoInfo`` dataclass.

            activities_file_path (str):
                Path or URI to manifest with activity annotations for each
                video. File must be a csv (w/header) whose columns are the
                fields of the ``ActivityData`` dataclass, i.e.
                ['video_id', 'start_time', 'stop_time', 'start_frame',
                'stop_frame', 'activity_id', 'activity_name'].

            clip_sampler (Callable[[Dict[str, Video], Dict[str, List[ActivityData]]],
                    List[VideoClipInfo]]):
                Called once, at construction time, with the loaded videos and
                the per-video activity annotations. The list of clips it
                returns defines the datapoints of this dataset.

            dataset_type (VideoDatasetType): The data format in which the
                dataset video data is stored (e.g. video frames, encoded video
                etc).

            frames_per_second (int): The FPS of the stored videos. (NOTE:
                this is variable and may be different than the original FPS
                reported on the DoMSEV dataset website -- it depends on the
                subsampling and frame extraction done internally at Facebook).

            transform (Optional[Callable[[Dict[str, Any]], Any]]):
                This callable is evaluated on the clip output before the clip
                is returned. It can be used for user-defined preprocessing and
                augmentations to the clips.

                The clip input is a dictionary with the following format:
                    {
                        'video': <video_tensor>,
                        'audio': <audio_tensor>,
                        'activities': <activities_tensor>,
                        'start_time': <float>,
                        'stop_time': <float>
                    }

                If transform is None, the raw clip output in the above format
                is returned unmodified.

            frame_filter (Optional[Callable[[List[int]], List[int]]]):
                This callable is evaluated on the set of available frame
                indices to be included in a sampled clip. This can be used to
                subselect frames within a clip to be loaded.

            multithreaded_io (bool):
                Boolean to control whether parallelizable io operations are
                performed across multiple threads.
        """
        # NOTE: the text above is deliberately a plain string literal. An
        # f-string in this position is not treated as a docstring by Python
        # (``__init__.__doc__`` would be None) and would be re-evaluated on
        # every instantiation.

        # Kept as asserts so callers still observe AssertionError on empty paths.
        assert video_info_file_path
        assert activities_file_path
        assert video_data_manifest_file_path

        # Populate video and metadata data providers
        self._videos: Dict[str, Video] = VideoDataset._load_videos(
            video_data_manifest_file_path,
            video_info_file_path,
            multithreaded_io,
            dataset_type,
        )

        # Activity annotations, grouped into one list per video_id.
        self._activities: Dict[str, List[ActivityData]] = load_dataclass_dict_from_csv(
            activities_file_path, ActivityData, "video_id", list_per_key=True
        )

        # Sample datapoints
        self._clips: List[VideoClipInfo] = clip_sampler(self._videos, self._activities)

        self._frames_per_second = frames_per_second
        self._user_transform = transform
        # __getitem__ always goes through _transform_clip, which in turn
        # applies the optional user transform.
        self._transform = self._transform_clip
        self._frame_filter = frame_filter

    def __getitem__(self, index) -> Dict[str, Any]:
        """
        Samples a video clip associated to the given index.

        Args:
            index (int): index for the video clip.

        Returns:
            A video clip with the following format if transform is None:
                {
                    'video_id': <str>,
                    'video': <video_tensor>,
                    'audio': <audio_tensor>,
                    'activities': <activities_tensor>,
                    'start_time': <float>,
                    'stop_time': <float>
                }
            Otherwise, the transform defines the clip output.
        """
        clip = self._clips[index]

        # Keep only the activities that overlap the clip boundaries, and
        # unpack them so there is one activity class ID per frame of overlap.
        activity_class_ids: List[int] = []
        for activity in self._activities[clip.video_id]:
            overlap_period = get_overlap_for_time_range_pair(
                clip.start_time, clip.stop_time, activity.start_time, activity.stop_time
            )
            if overlap_period is None:
                continue
            overlap_start_time, overlap_stop_time = overlap_period

            # Convert the overlapping period between clip and activity to
            # 0-indexed start and stop frame indexes, so we can unpack 1
            # activity label per frame.
            overlap_start_frame = seconds_to_frame_index(
                overlap_start_time, self._frames_per_second
            )
            overlap_stop_frame = seconds_to_frame_index(
                overlap_stop_time, self._frames_per_second
            )

            # One label per frame in [overlap_start_frame, overlap_stop_frame);
            # a non-positive count contributes nothing.
            num_overlap_frames = overlap_stop_frame - overlap_start_frame
            activity_class_ids.extend([activity.activity_id] * num_overlap_frames)

        # Pin the dtype so that a clip without any labelled frame still yields
        # an integer tensor (an empty list would otherwise default to float32).
        activity_class_ids_tensor = torch.tensor(activity_class_ids, dtype=torch.long)

        clip_data = {
            "video_id": clip.video_id,
            # The decoded clip entries are merged in here; the keys that
            # follow take precedence on any name clash.
            **self._videos[clip.video_id].get_clip(clip.start_time, clip.stop_time),
            "activities": activity_class_ids_tensor,
            "start_time": clip.start_time,
            "stop_time": clip.stop_time,
        }

        if self._transform:
            clip_data = self._transform(clip_data)

        return clip_data

    def __len__(self) -> int:
        """
        Returns:
            The number of video clips in the dataset.
        """
        return len(self._clips)

    def _transform_clip(self, clip: Dict[str, Any]) -> Dict[str, Any]:
        """Transforms a given video clip, according to some pre-defined transforms
        and an optional user transform function (self._user_transform).

        The pre-defined step replaces every ``None`` value in the clip with an
        empty tensor; the clip dictionary is modified in place.

        Args:
            clip (Dict[str, Any]): The clip that will be transformed.

        Returns:
            (Dict[str, Any]) The transformed clip.
        """
        # Only values of existing keys are replaced, so the dict does not
        # change size while it is being iterated.
        for key, value in clip.items():
            if value is None:
                clip[key] = torch.tensor([])

        if self._user_transform:
            clip = self._user_transform(clip)

        return clip