# sakt.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class sakt(nn.Module):
    """Self-attentive knowledge tracing model (SAKT-style).

    Interaction embeddings (plus positional embeddings) are used as attention
    keys and values, exercise embeddings are used as queries, and a causal
    mask keeps every position from attending to later positions. The
    attention output goes through a two-layer feed-forward network and a
    final linear + sigmoid layer that yields one probability per position.
    """

    def __init__(self, ex_total, seq_len, dim, heads, dout):
        """Build the model layers.

        Args:
            ex_total: Number of unique exercises. The interaction embedding
                has ``2 * ex_total + 1`` rows and the exercise embedding has
                ``ex_total + 1`` rows.
            seq_len: Maximum sequence length (size of the positional
                embedding table).
            dim: Latent dimension used by every embedding and layer.
            heads: Number of attention heads; ``dim`` must be divisible by it.
            dout: Dropout probability used inside the attention layer and
                after the feed-forward network.
        """
        super().__init__()
        self.seq_len = seq_len
        self.dim = dim

        self.embd_in = nn.Embedding(2 * ex_total + 1, embedding_dim=dim)  # Interaction embedding
        self.embd_ex = nn.Embedding(ex_total + 1, embedding_dim=dim)  # Exercise embedding
        self.embd_pos = nn.Embedding(seq_len, embedding_dim=dim)  # Positional embedding

        # One linear projection each for value, key and query (in that order).
        self.linear = nn.ModuleList(
            [nn.Linear(in_features=dim, out_features=dim) for _ in range(3)])
        self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=heads, dropout=dout)
        # Feed-forward layers applied after attention. The width is taken from
        # ``dim`` so the model does not depend on any module-level variable.
        self.ffn = nn.ModuleList(
            [nn.Linear(in_features=dim, out_features=dim, bias=True) for _ in range(2)])

        self.linear_out = nn.Linear(in_features=dim, out_features=1, bias=True)  # correctness prediction head
        self.layer_norm1 = nn.LayerNorm(dim)
        self.layer_norm2 = nn.LayerNorm(dim)
        self.drop = nn.Dropout(dout)

    def forward(self, input_in, input_ex):
        """Run the model on a batch of sequences.

        Args:
            input_in: Integer tensor of interaction ids, shape ``(b, n)`` with
                ``n <= seq_len``.
            input_ex: Integer tensor of exercise ids, shape ``(b, n)``.

        Returns:
            Tensor of shape ``(b, n, 1)`` with sigmoid outputs in ``[0, 1]``.

        Raises:
            ValueError: If the sequence is longer than ``seq_len``.
        """
        n = input_in.size(1)
        if n > self.seq_len:
            raise ValueError(
                f"sequence length {n} exceeds the configured maximum {self.seq_len}")

        # Create index tensors on the input's device so the model also works
        # after being moved off the CPU.
        positions = torch.arange(n, device=input_in.device)

        ## positional embedding, with a leading axis so it broadcasts over the batch
        pos_in = self.embd_pos(positions).unsqueeze(0)  # (1, n, d)

        ## interaction embedding (source of keys and values)
        out_in = self.embd_in(input_in) + pos_in  # (b, n) --> (b, n, d)

        ## exercise embedding (source of queries)
        query_ex = self.embd_ex(input_ex)  # (b, n) --> (b, n, d)

        ## linearly project all the embeddings; the attention layer expects (n, b, d)
        value_in = self.linear[0](out_in).permute(1, 0, 2)
        key_in = self.linear[1](out_in).permute(1, 0, 2)
        query_ex = self.linear[2](query_ex).permute(1, 0, 2)

        ## causal mask: True strictly above the diagonal blocks attention to later positions
        causal_mask = positions.unsqueeze(0) > positions.unsqueeze(1)  # (n, n)

        ## multi-head attention with residual connection on the query, then layer norm
        atn_out, _ = self.attn(query_ex, key_in, value_in, attn_mask=causal_mask)
        atn_out = self.layer_norm1(query_ex + atn_out)

        # take batch on first axis
        atn_out = atn_out.permute(1, 0, 2)  # (n, b, d) --> (b, n, d)

        ## two-layer feed-forward network with dropout, residual connection and layer norm
        ffn_out = self.drop(self.ffn[1](F.relu(self.ffn[0](atn_out))))
        ffn_out = self.layer_norm2(ffn_out + atn_out)

        ## sigmoid over the prediction head
        return torch.sigmoid(self.linear_out(ffn_out))


def randomdata():
    """Return a random batch of integer ids, twice.

    Draws one ``(64, 12)`` integer tensor with values in ``[0, 49)`` and
    returns that same tensor object as both elements of the tuple.
    """
    batch_size, seq_len = 64, 12
    ids = torch.randint(low=0, high=49, size=(batch_size, seq_len))
    return ids, ids


## Smoke test: run one forward pass on random data and print the shapes
E, d, n = 50, 128, 12  # unique exercises, latent dimension, sequence length

# Data is drawn before the model is built so the random number stream is
# consumed in a fixed order.
d1, d2 = randomdata()
print('Input shape', d1.shape)

model = sakt(
    ex_total=E,
    seq_len=n,
    dim=d,
    heads=8,
    dout=0.2,
)
out = model(d1, d2)
print('Output shape', out.shape)