#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 Vladislav Lialin and Namrata Shivagunde
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class SelfAttention(nn.Module):
def __init__(self, input_size, hidden):
"""Self-attention module which computes softmax(xQ @ xK^T) @ xV
Args:
input_size: int, size of input vectors
hidden: int, size of output vectors and hidden states
"""
        # Task 1.1: Create layers required for self-attention (1 point)
# YOUR CODE STARTS HERE (~4 lines)
super(SelfAttention, self).__init__()
self.k = nn.Linear(input_size, hidden)
self.q = nn.Linear(input_size, hidden)
self.v = nn.Linear(input_size, hidden)
self.dk = math.sqrt(hidden)
# YOUR CODE ENDS HERE
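        # Note: self.dk = sqrt(hidden) is the scaling factor of scaled dot-product
        # attention; dividing the scores by it keeps their magnitude roughly
        # independent of the hidden size before the softmax.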

    def forward(self, x):
"""Softmax(xQ @ xK^T) @ xV
Args:
x: FloatTensor[batch_size, seq_len, input_size]
Returns:
FloatTensor[batch_size, seq_len, hidden]
"""
# Task 1.2: Compute Self Attention (3 points)
# 1. Compute key, query and value matrices from your input x
# 2. Compute the scores using query and key matrices
        # 3. Scale the scores by sqrt(hidden) and compute probabilities using softmax
# 4. Compute the output using probabilities and value matrices
#
# Write shape of each tensor for each line of code
# for example:
# Suppose batch_size = 3 and seq_len = 5
# x = torch.zeros(3, 5) # shape [batch_size, seq_len]
        # x = x.unsqueeze(1) # shape [batch_size, 1, seq_len]
#
        # NOTE: Remember that we work with batches of data [batch_size, seq_len, hidden],
# not just single examples [seq_len, hidden] as we did in the lecture. This changes your shapes a bit.
#
# YOUR CODE STARTS HERE (~ can be implemented in 4 lines or 3 if you combine steps 2 and 3 into one operation)
key = self.k(x) # [batch_size, seq_len, hidden]
query = self.q(x) # [batch_size, seq_len, hidden]
value = self.v(x) # [batch_size, seq_len, hidden]
scores = torch.matmul(query, key.transpose(1, 2)) # [batch_size, seq_len, seq_len]
probabilities = F.softmax(scores / self.dk, dim=-1) # [batch_size, seq_len, seq_len]
output = torch.matmul(probabilities, value) # [batch_size, seq_len, hidden]
return output
# YOUR CODE ENDS HERE
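
# A quick usage sketch (not part of the assignment), assuming dummy sizes:
#   attn = SelfAttention(input_size=16, hidden=32)
#   out = attn(torch.randn(3, 5, 16))   # out.shape == torch.Size([3, 5, 32])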


class MultiHeadSelfAttention(nn.Module):
def __init__(self, input_size, hidden, num_heads, causal=False, dropout=0):
"""
Args:
input_size: int, size of input vectors
hidden: int, size of output vectors and hidden states
num_heads: int, number of attention heads, should be a divisor of hidden
            causal: bool, use causal masking (do not allow queries to attend to keys that correspond to future tokens)
            dropout: float, dropout probability applied to the attention probabilities
"""
if hidden % num_heads:
raise ValueError(f"hidden should be divisible by num_heads, "
f"but got hidden={hidden} and num_heads={num_heads}")
super().__init__()
self.k = nn.Linear(input_size, hidden)
self.q = nn.Linear(input_size, hidden)
self.v = nn.Linear(input_size, hidden)
self.mix = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(dropout)  # applied to the attention probabilities after the softmax
self.num_heads = num_heads
self.head_size = hidden // num_heads
self.scale = self.head_size ** 0.5
self.causal = causal # causal masking

    def forward(self, x, return_attention=False):
"""Computes [Softmax(x Q_1 @ x K_1^T) @ x V_1 : ... : Softmax(x Q_heads @ x K_heads^T) @ x V_heads] @ U
Args:
x: FloatTensor[batch_size, seq_len, input_size]
Returns:
FloatTensor[batch_size, seq_len, hidden]
            if return_attention is True, also returns the attention probabilities FloatTensor[batch_size, num_heads, seq_len, seq_len]
"""
bs, seq, _ = x.shape
# Task 2.1 (3 points)
# YOUR CODE STARTS HERE (Our implementation is in 3 lines, one for each for k, q and v)
key = self.k(x) # [batch_size, seq_len, hidden]
query = self.q(x) # [batch_size, seq_len, hidden]
value = self.v(x) # [batch_size, seq_len, hidden]
        # Split the hidden dimension into heads; torch.Tensor.view behaves like numpy.reshape
        # https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
        k = key.view(bs, -1, self.num_heads, self.head_size).transpose(1, 2)    # (batch_size, num_heads, seq_len, head_size)
        q = query.view(bs, -1, self.num_heads, self.head_size).transpose(1, 2)  # (batch_size, num_heads, seq_len, head_size)
        v = value.view(bs, -1, self.num_heads, self.head_size).transpose(1, 2)  # (batch_size, num_heads, seq_len, head_size)
        # Compute the per-head scores, scaled by sqrt(head_size)
        scores = torch.matmul(q, k.transpose(2, 3)) / self.scale                # (batch_size, num_heads, seq_len, seq_len)
# YOUR CODE ENDS HERE
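        # Example with hypothetical sizes: hidden=32 and num_heads=4 give head_size=8,
        # so q, k, v each go from (bs, seq, 32) to (bs, 4, seq, 8) and attention is
        # computed independently for each of the 4 heads.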
if self.causal:
# Task 2.2 (1 point)
            # Apply causal mask to the scores
            # YOUR CODE STARTS HERE (Our implementation is in 2 lines)
            # Build a boolean [seq_len, seq_len] mask that is True strictly above the
            # diagonal, i.e. wherever a query would attend to a future key; it
            # broadcasts against scores of shape (batch_size, num_heads, seq_len, seq_len).
            mask = torch.triu(torch.ones(seq, seq, dtype=torch.bool, device=x.device), diagonal=1)
            scores = scores.masked_fill(mask, float("-inf"))
# YOUR CODE ENDS HERE
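            # Illustration (seq = 3): torch.triu(torch.ones(3, 3), diagonal=1) is
            #     [[0., 1., 1.],
            #      [0., 0., 1.],
            #      [0., 0., 0.]]
            # so query 0 attends only to key 0, query 1 to keys 0-1, query 2 to keys 0-2;
            # the True positions are filled with -inf and get zero probability after softmax.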
# Task 2.3 (2 points)
# Compute probability (probs) and attention (att), remember to apply mixing matrix
# YOUR CODE STARTS HERE (can be implemented in 4 lines)
        probs = F.softmax(scores, dim=-1)  # (batch_size, num_heads, seq_len, seq_len)
        probs = self.dropout(probs)
        att = torch.matmul(probs, v)       # (batch_size, num_heads, seq_len, head_size)
        # Merge the heads back: (batch_size, num_heads, seq_len, head_size) -> (batch_size, seq_len, hidden)
        att = att.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.head_size)
        att = self.mix(att)                # (batch_size, seq_len, hidden)
# YOUR CODE ENDS HERE
if return_attention:
return att, probs
return att
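

# A minimal sanity-check sketch (not part of the original assignment); the sizes
# below are arbitrary and only illustrate the expected shapes and causal behaviour.
if __name__ == "__main__":
    torch.manual_seed(0)
    x = torch.randn(2, 7, 16)

    self_attn = SelfAttention(input_size=16, hidden=32)
    assert self_attn(x).shape == (2, 7, 32)

    mha = MultiHeadSelfAttention(input_size=16, hidden=32, num_heads=4, causal=True)
    mha.eval()  # disable dropout so the comparison below is deterministic
    out = mha(x)
    assert out.shape == (2, 7, 32)

    # With causal masking, perturbing the last token must not change earlier positions.
    x_perturbed = x.clone()
    x_perturbed[:, -1, :] += 1.0
    out_perturbed = mha(x_perturbed)
    assert torch.allclose(out[:, :-1], out_perturbed[:, :-1], atol=1e-6)
    print("Sanity checks passed.")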