This repository contains a Python implementation of the Transformer model introduced in the paper "Attention is All You Need". The implementation is designed to be lightweight, with scaled-down hyperparameters for quick experimentation. For the original hyperparameters used in the paper, please refer to the "Attention is All You Need" paper.
- Input embeddings
```python
import math
import torch
import torch.nn as nn

class InputEmbeddings(nn.Module):
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        # (batch, seq_length) --> (batch, seq_length, d_model), scaled by sqrt(d_model) as in the paper
        return self.embeddings(x) * math.sqrt(self.d_model)
```
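Below is a quick standalone shape check (dummy vocabulary size and tokens, not part of the repo code) showing the embedding lookup and the sqrt(d_model) scaling:

```python
import math
import torch
import torch.nn as nn

emb = nn.Embedding(1000, 512)                # vocab_size=1000, d_model=512
tokens = torch.randint(0, 1000, (2, 7))      # (batch, seq_length) of token ids
out = emb(tokens) * math.sqrt(512)           # (batch, seq_length, d_model)
print(out.shape)                             # torch.Size([2, 7, 512])
```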
- Positional encodings
```python
class PositionalEncoding(nn.Module):
    def __init__(self, seq_length, d_model, dropout):
        super().__init__()
        self.seq_length = seq_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create a tensor of shape (seq_length, d_model)
        pe = torch.zeros(seq_length, d_model)
        # position indices, expanded to shape (seq_length, 1)
        position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
        # denominator term 1 / 10000^(2i/d_model), computed in log space
        denom = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)) / d_model)
        pe[:, 0::2] = torch.sin(position * denom)
        pe[:, 1::2] = torch.cos(position * denom)
        # add a leading batch dimension: (1, seq_length, d_model)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        # add the fixed (non-trainable) encodings for the first x.shape[1] positions
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)
```
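The `denom` term above is the paper's 1/10000^(2i/d_model) factor computed in log space for numerical stability; a small standalone check (hypothetical d_model, not repo code) confirms the two forms agree:

```python
import math
import torch

d_model = 8
i = torch.arange(0, d_model, 2).float()                       # already the even indices (the paper's 2i)
log_space = torch.exp(i * (-math.log(10000.0)) / d_model)     # form used in the class above
direct = 1.0 / (10000.0 ** (i / d_model))                     # 1 / 10000^(2i/d_model) written directly
print(torch.allclose(log_space, direct))                      # True
```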
- Layer Norm
The mean and standard deviation are calculated over the last D dimensions of the input tensor, where D is given by `normalized_shape` (in our case `d_model`). That is, with `d_model=512` it takes the last 512 values, calculates their mean and variance, and then subtracts the mean from each value and divides by the standard deviation.
```python
class LayerNorm(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model
        self.layernorm = nn.LayerNorm(d_model)
    def forward(self, x):
        return self.layernorm(x)
```
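A quick standalone check (made-up shapes) makes the "normalize over the last dimension" behaviour concrete: after nn.LayerNorm, the mean over the last dimension is roughly 0 and the standard deviation roughly 1 for every position:

```python
import torch
import torch.nn as nn

x = torch.randn(2, 5, 512)            # (batch, seq_length, d_model)
y = nn.LayerNorm(512)(x)
print(y.mean(dim=-1).abs().max())     # close to 0
print(y.std(dim=-1).mean())           # close to 1
```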
- Feed forward layer - goal of this layer: (batch, seq_length, d_model) --> (batch, seq_length, d_ff) --> ReLU --> dropout --> (batch, seq_length, d_model)
```python
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff  # inner dimension of the feed-forward layer
        self.l1 = nn.Linear(d_model, d_ff)
        self.l2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        # (batch, seq_length, d_model) --> (batch, seq_length, d_ff) --> relu --> dropout --> (batch, seq_length, d_model)
        return self.l2(self.dropout(torch.relu(self.l1(x))))
```
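Assuming the FeedForward class above is in scope (and torch is imported), a short sketch confirms that the layer preserves the (batch, seq_length, d_model) shape:

```python
import torch

ff = FeedForward(d_model=512, d_ff=2048, dropout=0.1)
x = torch.randn(2, 10, 512)      # (batch, seq_length, d_model)
print(ff(x).shape)               # torch.Size([2, 10, 512])
```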
- MultiHead Attention-
```python
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.d_model = d_model
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        # batch_first=True so inputs/outputs are (batch, seq_length, d_model)
        self.attentionlayer = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
    def forward(self, q, k, v, mask=None):
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)
        attn_output, attn_output_weights = self.attentionlayer(query, key, value, attn_mask=mask)
        return attn_output, attn_output_weights
```
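The mask passed here ends up as the attn_mask argument of nn.MultiheadAttention. As a hedged sketch, a causal (look-ahead) mask for the decoder can be built with torch.triu, where True marks positions that may not be attended to:

```python
import torch

seq_length = 5
# True above the diagonal: each position is blocked from attending to future positions
causal_mask = torch.triu(torch.ones(seq_length, seq_length, dtype=torch.bool), diagonal=1)
print(causal_mask[0])   # tensor([False,  True,  True,  True,  True])
```

A mask of this shape (seq_length, seq_length) is what the decoder's tgt_mask is expected to look like in this sketch.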
- Residual connections-
```python
class ResidualConnection(nn.Module):
    def __init__(self, d_model, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.normlayer = LayerNorm(d_model)
    def forward(self, x, sublayer):
        # pre-norm residual: normalize, apply the sublayer, dropout, then add the skip connection
        return x + self.dropout(sublayer(self.normlayer(x)))
```
- Encoder block-
```python
class EncoderBlock(nn.Module):
    def __init__(self, d_model, feed_forward: FeedForward, self_multihead_attention: MultiHeadAttention, dropout: float):
        super().__init__()
        self.feed_forward = feed_forward
        self.self_multihead_attention = self_multihead_attention
        self.residuallayers = nn.ModuleList([ResidualConnection(d_model, dropout) for _ in range(2)])
    def forward(self, x, mask):
        # self_multihead_attention returns a tuple; we want just the first output
        x = self.residuallayers[0](x, lambda x: self.self_multihead_attention(x, x, x, mask)[0])
        x = self.residuallayers[1](x, self.feed_forward)
        return x
```
- Encoder - this is N repetitions of the encoder block.
```python
class Encoder(nn.Module):
    def __init__(self, d_model, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.normlayer = LayerNorm(d_model)
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.normlayer(x)
```
Whenever we want to repeat a class, we can use nn.ModuleList and then iterate through it in the forward function.
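As a minimal standalone illustration of that pattern (a hypothetical stack of linear layers, not part of the transformer itself):

```python
import torch
import torch.nn as nn

class Stack(nn.Module):
    def __init__(self, n):
        super().__init__()
        # nn.ModuleList registers each repeated layer so its parameters are tracked
        self.layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(n)])
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

print(Stack(3)(torch.randn(4, 16)).shape)   # torch.Size([4, 16])
```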
- Decoder block and decoder - same structure as the encoder, but with changes in its attention blocks: masked self-attention over the target, plus cross-attention over the encoder output.
```python
class DecoderBlock(nn.Module):
    def __init__(self, d_model, self_attention: MultiHeadAttention, cross_attention: MultiHeadAttention, feedforward: FeedForward, dropout):
        super().__init__()
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.feedforward = feedforward
        self.residuallayers = nn.ModuleList([ResidualConnection(d_model, dropout) for _ in range(3)])
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # src_mask = mask of the encoder (source), tgt_mask = mask of the decoder (target)
        x = self.residuallayers[0](x, lambda x: self.self_attention(x, x, x, tgt_mask)[0])
        x = self.residuallayers[1](x, lambda x: self.cross_attention(x, encoder_output, encoder_output, src_mask)[0])
        x = self.residuallayers[2](x, self.feedforward)
        return x
```
```python
class Decoder(nn.Module):
    def __init__(self, d_model, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.normlayer = LayerNorm(d_model)
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.normlayer(x)
```
These are all the parts needed to build a transformer, but one final step remains: mapping the decoder output back to the vocabulary so we can predict words. A last linear layer projects each position's d_model-dimensional vector to vocab_size scores (followed here by a log-softmax); this is called the projection layer.
```python
class ProjectionLayer(nn.Module):
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.proj = nn.Linear(d_model, vocab_size)
    def forward(self, x):
        # (batch_size, seq_length, d_model) --> (batch_size, seq_length, vocab_size)
        return torch.log_softmax(self.proj(x), dim=-1)
```
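Since the projection layer already applies log_softmax, its output pairs with nn.NLLLoss rather than nn.CrossEntropyLoss (which expects raw logits). A standalone sketch with made-up shapes:

```python
import torch
import torch.nn as nn

vocab_size = 1000
log_probs = torch.log_softmax(torch.randn(2, 7, vocab_size), dim=-1)  # stand-in for ProjectionLayer output
targets = torch.randint(0, vocab_size, (2, 7))                        # gold token ids
loss = nn.NLLLoss()(log_probs.view(-1, vocab_size), targets.view(-1))
print(loss.item())
```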
- Transformer block - this ties everything together: it takes the source and target embeddings along with their positional encodings, applies them to the input tokens, and passes the result through the encoder and decoder.
```python
class Transformer(nn.Module):
    def __init__(self, src_emb: InputEmbeddings, tgt_emb: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, encoder: Encoder, decoder: Decoder, proj_layer: ProjectionLayer):
        super().__init__()
        self.src_emb = src_emb
        self.tgt_emb = tgt_emb
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.encoder = encoder
        self.decoder = decoder
        self.proj_layer = proj_layer
    def encode(self, src, src_mask):
        src = self.src_emb(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    def decode(self, encoder_output, tgt, src_mask, tgt_mask):
        tgt = self.tgt_emb(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    def project(self, x):
        return self.proj_layer(x)
```
- Build function-
```python
def build_transformer(src_vocab_size, src_seq_length, tgt_vocab_size, tgt_seq_length, d_model=512, d_ff=2048, dropout=0.1, num_heads=2, N=2):
    # N is the number of encoder/decoder blocks
    src_emb = InputEmbeddings(d_model, src_vocab_size)
    tgt_emb = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(src_seq_length, d_model, dropout)
    tgt_pos = PositionalEncoding(tgt_seq_length, d_model, dropout)
    # making N encoder blocks
    encoder_blocks = []
    for _ in range(N):
        feedforwardlayer = FeedForward(d_model, d_ff, dropout)
        encoder_atten_head = MultiHeadAttention(d_model, num_heads, dropout)
        encoder_block = EncoderBlock(d_model, feedforwardlayer, encoder_atten_head, dropout)
        encoder_blocks.append(encoder_block)
    # making N decoder blocks
    decoder_blocks = []
    for _ in range(N):
        feedforwardlayer = FeedForward(d_model, d_ff, dropout)
        decoder_self_atten = MultiHeadAttention(d_model, num_heads, dropout)
        decoder_cross_atten = MultiHeadAttention(d_model, num_heads, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_atten, decoder_cross_atten, feedforwardlayer, dropout)
        decoder_blocks.append(decoder_block)
    # creating the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    proj_layer = ProjectionLayer(d_model, tgt_vocab_size)
    transformer = Transformer(src_emb, tgt_emb, src_pos, tgt_pos, encoder, decoder, proj_layer)
    # initialise the parameters with Xavier uniform
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return transformer
```
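Assuming all of the classes above live in a single module, a minimal end-to-end sketch (dummy token ids, no training) to sanity-check the shapes might look like this:

```python
import torch

src_vocab, tgt_vocab, seq_len = 100, 120, 10
model = build_transformer(src_vocab, seq_len, tgt_vocab, seq_len)

src = torch.randint(0, src_vocab, (2, seq_len))     # (batch, seq_length) source token ids
tgt = torch.randint(0, tgt_vocab, (2, seq_len))     # (batch, seq_length) target token ids
# causal mask for the decoder; no padding mask in this toy example
tgt_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

enc_out = model.encode(src, src_mask=None)
dec_out = model.decode(enc_out, tgt, src_mask=None, tgt_mask=tgt_mask)
print(model.project(dec_out).shape)                  # torch.Size([2, 10, 120])
```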