Source code for pytorchvideo.layers.positional_encoding
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import torch
from torch import nn
class PositionalEncoding(nn.Module):
    """
    Applies a fixed sinusoidal positional encoding to a tensor with shape
    (batch_size x seq_len x embed_dim).

    The positional encoding is computed as follows:
        PE(pos,2i) = sin(pos/10000^(2i/dmodel))
        PE(pos,2i+1) = cos(pos/10000^(2i/dmodel))

        where pos = position, pos in [0, seq_len)
        dmodel = data embedding dimension = embed_dim
        i = sine/cosine pair index, i in [0, embed_dim / 2)

    When embed_dim is odd, the final column holds the sine term of the last
    pair and its cosine partner is simply omitted.

    Reference: "Attention Is All You Need" https://arxiv.org/abs/1706.03762
    Implementation Reference: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, embed_dim: int, seq_len: int = 1024) -> None:
        """
        Precompute the encoding table and store it as the buffer ``pe``.

        Args:
            embed_dim (int): size of the last (embedding) dimension of the
                inputs that will be passed to ``forward``.
            seq_len (int): number of positions to precompute; ``forward``
                rejects inputs whose dimension 1 is longer than this.
        """
        super().__init__()
        pe = torch.zeros(seq_len, embed_dim, dtype=torch.float)
        # Column vector of positions so it broadcasts against the frequencies.
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / embed_dim), computed in log space; one entry per
        # even column index, i.e. ceil(embed_dim / 2) entries.
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-(math.log(10000.0)) / embed_dim)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # There are only floor(embed_dim / 2) odd columns, so drop the unpaired
        # last frequency when embed_dim is odd (a no-op slice when it is even).
        pe[:, 1::2] = torch.cos(position * div_term[: embed_dim // 2])
        # Leading singleton dimension broadcasts over the batch in forward().
        pe = pe.unsqueeze(0)
        # Registered as a buffer (not a parameter): part of the module state
        # that follows the module across devices but is not trained.
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Add the positional encoding for the first ``x.size(1)`` positions to ``x``.

        Args:
            x (torch.Tensor): input of shape (batch_size x seq_len x embed_dim).

        Returns:
            torch.Tensor: ``x`` plus the matching slice of the encoding table.

        Raises:
            AssertionError: if ``x.size(1)`` exceeds the precomputed ``seq_len``.
        """
        assert self.pe.size(1) >= x.size(1), (
            "Cannot apply position encoding of size "
            + f"{self.pe.size()} when input has size {x.size()}"
        )
        return x + self.pe[:, : x.size(1), :]