Transformer model implemented with Pytorch

Overview

transformer-pytorch

Transformer model implemented with Pytorch

Attention is all you need-[Paper]

Architecture

Transformer


Self-Attention

Attention

self_attention.py

[N, len, heads, head_dim] values = values.reshape(N, value_len, self.heads, self.head_dim) keys = keys.reshape(N, key_len, self.heads, self.head_dim) queries = queries.reshape(N, query_len, self.heads, self.head_dim) # Einsum does matrix mult. for query*keys for each training example # with every other training example, don't be confused by einsum # it's just how I like doing matrix multiplication & bmm energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys]) # queries shape: (N, query_len, heads, heads_dim), # keys shape: (N, key_len, heads, heads_dim) # energy: (N, heads, query_len, key_len) # Mask padded indices so their weights become 0 if mask is not None: energy = energy.masked_fill(mask == 0, float("-1e20")) # Normalize energy values similarly to seq2seq + attention # so that they sum to 1. Also divide by scaling factor for # better stability attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3) # attention shape: (N, heads, query_len, key_len) out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape( N, query_len, self.heads * self.head_dim ) # attention shape: (N, heads, query_len, key_len) # values shape: (N, value_len, heads, heads_dim) # out after matrix multiply: (N, query_len, heads, head_dim), then # we reshape and flatten the last two dimensions. out = self.fc_out(out) # Linear layer doesn't modify the shape, final shape will be # (N, query_len, embed_size) return out ">
 class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads      = heads
        self.head_dim   = embed_size // heads

        assert (
                self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values  = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys    = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.fc_out  = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values  = self.values(values)
        keys    = self.keys(keys)
        queries = self.queries(query)
        
        # Split the embedding into self.heads different pieces
        # Multi head
        # [N, len, embed_size] --> [N, len, heads, head_dim]
        values    = values.reshape(N, value_len, self.heads, self.head_dim)
        keys      = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries   = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

Encoder Block

Encoder

encoder_block.py

class EncoderBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(EncoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1     = nn.LayerNorm(embed_size)
        self.norm2     = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x       = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out     = self.dropout(self.norm2(forward + x))
        return out

Encoder

Encoder

encoder.py

class Encoder(nn.Module):
    def __init__(
            self,
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size         = embed_size
        self.device             = device
        self.word_embedding     = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                EncoderBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

Decoder Block

DecoderBlock

docoder_block.py

class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm              = nn.LayerNorm(embed_size)
        self.attention         = SelfAttention(embed_size, heads=heads)
        self.transformer_block = EncoderBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout           = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query     = self.dropout(self.norm(attention + x))
        out       = self.transformer_block(value, key, query, src_mask)
        return out

Decoder

Decoder

decoder.py

class Decoder(nn.Module):
    def __init__(
            self,
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
    ):
        super(Decoder, self).__init__()
        self.device             = device
        self.word_embedding     = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        
        self.dropout = nn.Dropout(dropout)
        self.fc_out  = nn.Linear(embed_size, trg_vocab_size)


    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions     = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x             = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)
        return out

Transformer

transformer.py

class Transformer(nn.Module):
    def __init__(
            self,
            src_vocab_size,
            trg_vocab_size,
            src_pad_idx,
            trg_pad_idx,
            embed_size=512,
            num_layers=6,
            forward_expansion=4,
            heads=8,
            dropout=0,
            device="cpu",
            max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device      = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

Authors

Owner
Mingu Kang
SW Engineering / ML / DL / Blockchain Dept. of Software Engineering, Jeonbuk National University
Mingu Kang
Sequential GCN for Active Learning

Sequential GCN for Active Learning Please cite if using the code: Link to paper. Requirements: python 3.6+ torch 1.0+ pip libraries: tqdm, sklearn, sc

45 Dec 26, 2022
From Perceptron model to Deep Neural Network from scratch in Python.

Neural-Network-Basics Aim of this Repository: From Perceptron model to Deep Neural Network (from scratch) in Python. ** Currently working on a basic N

Aditya Kahol 1 Jan 14, 2022
Source code for the paper: Variance-Aware Machine Translation Test Sets (NeurIPS 2021 Datasets and Benchmarks Track)

Variance-Aware-MT-Test-Sets Variance-Aware Machine Translation Test Sets License See LICENSE. We follow the data licensing plan as the same as the WMT

NLP2CT Lab, University of Macau 5 Dec 21, 2021
A Simple Framwork for CV Pre-training Model (SOCO, VirTex, BEiT)

A Simple Framwork for CV Pre-training Model (SOCO, VirTex, BEiT)

Sense-GVT 14 Jul 07, 2022
PyTorch reimplementation of hand-biomechanical-constraints (ECCV2020)

Hand Biomechanical Constraints Pytorch Unofficial PyTorch reimplementation of Hand-Biomechanical-Constraints (ECCV2020). This project reimplement foll

Hao Meng 59 Dec 20, 2022
A parametric soroban written with CADQuery.

A parametric soroban written in CADQuery The purpose of this project is to demonstrate how "code CAD" can be intuitive to learn. See soroban.py for a

Lee 4 Aug 13, 2022
Full-featured Decision Trees and Random Forests learner.

CID3 This is a full-featured Decision Trees and Random Forests learner. It can save trees or forests to disk for later use. It is possible to query tr

Alejandro Penate-Diaz 3 Aug 15, 2022
EmoTag helps you train emotion detection model for Chinese audios

emoTag emoTag helps you train emotion detection model for Chinese audios. Environment pip install -r requirement.txt Data We used Emotional Speech Dat

_zza 4 Sep 07, 2022
Doing the asl sign language classification on static images using graph neural networks.

SignLangGNN When GNNs 💜 MediaPipe. This is a starter project where I tried to implement some traditional image classification problem i.e. the ASL si

10 Nov 09, 2022
A Pytorch implementation of "LegoNet: Efficient Convolutional Neural Networks with Lego Filters" (ICML 2019).

LegoNet This code is the implementation of ICML2019 paper LegoNet: Efficient Convolutional Neural Networks with Lego Filters Run python train.py You c

YangZhaohui 140 Sep 26, 2022
Learning Versatile Neural Architectures by Propagating Network Codes

Learning Versatile Neural Architectures by Propagating Network Codes Mingyu Ding, Yuqi Huo, Haoyu Lu, Linjie Yang, Zhe Wang, Zhiwu Lu, Jingdong Wang,

Mingyu Ding 36 Dec 06, 2022
Official pytorch implementation for Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion (CVPR 2022)

Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion This repository contains a pytorch implementation of "Learning to Listen: Modeling

50 Dec 17, 2022
A PyTorch version of You Only Look at One-level Feature object detector

PyTorch_YOLOF A PyTorch version of You Only Look at One-level Feature object detector. The input image must be resized to have their shorter side bein

Jianhua Yang 25 Dec 30, 2022
Trading and Backtesting environment for training reinforcement learning agent or simple rule base algo.

TradingGym TradingGym is a toolkit for training and backtesting the reinforcement learning algorithms. This was inspired by OpenAI Gym and imitated th

Yvictor 1.1k Jan 02, 2023
Deep Learning Package based on TensorFlow

White-Box-Layer is a Python module for deep learning built on top of TensorFlow and is distributed under the MIT license. The project was started in M

YeongHyeon Park 7 Dec 27, 2021
A framework for Quantification written in Python

QuaPy QuaPy is an open source framework for quantification (a.k.a. supervised prevalence estimation, or learning to quantify) written in Python. QuaPy

41 Dec 14, 2022
🎯 A comprehensive gradient-free optimization framework written in Python

Solid is a Python framework for gradient-free optimization. It contains basic versions of many of the most common optimization algorithms that do not

Devin Soni 565 Dec 26, 2022
ContourletNet: A Generalized Rain Removal Architecture Using Multi-Direction Hierarchical Representation

ContourletNet: A Generalized Rain Removal Architecture Using Multi-Direction Hierarchical Representation (Accepted by BMVC'21) Abstract: Images acquir

10 Dec 08, 2022
Digan - Official PyTorch implementation of Generating Videos with Dynamics-aware Implicit Generative Adversarial Networks

DIGAN (ICLR 2022) Official PyTorch implementation of "Generating Videos with Dyn

Sihyun Yu 147 Dec 31, 2022
Detector for Log4Shell exploitation attempts

log4shell-detector Detector for Log4Shell exploitation attempts Idea The problem with the log4j CVE-2021-44228 exploitation is that the string can be

Florian Roth 729 Dec 25, 2022