"""
Modules used for the biaffine dependency parser
- BiAffineAttn Module
- MLP Module
- TimeDistributed wrapper
"""
import torch
from torch import nn
from torch.nn import functional
from torch.autograd import Variable
import numpy as np


class BiAffineAttn(nn.Module):
"""
BiAffine Attention layer from https://arxiv.org/abs/1611.01734
Expects inputs as batch-first sequences [batch_size, seq_length, dim].
Returns score matrices as [batch_size, dim, dim] for arc attention
(out_channels=1), and score as [batch_size, out_channels, dim, dim]
for label attention (where out_channels=#labels).
"""

    def __init__(self, in_dim, out_channels, bias_head=True, bias_dep=True):
        super(BiAffineAttn, self).__init__()
        self.bias_head = bias_head
        self.bias_dep = bias_dep
        # One (in_dim + bias) x (in_dim + bias) bilinear weight matrix per
        # output channel; the optional extra row/column absorbs the bias term.
        self.U = nn.Parameter(torch.Tensor(out_channels,
                                           in_dim + int(bias_head),
                                           in_dim + int(bias_dep)))
        self.reset_parameters()

    def reset_parameters(self):
        # Uniform initialisation scaled by the fan-in of U, as in nn.Linear.
        stdv = 1. / np.sqrt(self.U.size(1))
        self.U.data.uniform_(-stdv, stdv)

    def forward(self, Rh, Rd):
        """
        Compute S = Rh @ U @ Rd^T with shape [batch_size, out_channels, t, t],
        where S[b, c, i, j] is the score for sample b, label c, head i, dependent j.
        """
if self.bias_head:
Rh = self.add_ones_col(Rh)
if self.bias_dep:
Rd = self.add_ones_col(Rd)
# Add dimension to Rh and Rd for batch matrix products,
# shape [batch, t, d] -> [batch, 1, t, d]
Rh = Rh.unsqueeze(1)
Rd = Rd.unsqueeze(1)
S = Rh @ self.U @ torch.transpose(Rd, -1, -2)
# If out_channels == 1, squeeze [batch, 1, t, t] -> [batch, t, t]
return S.squeeze(1)

    @staticmethod
    def add_ones_col(X):
        """
        Append a column of ones to each matrix in the batch:
        [batch, t, d] -> [batch, t, d + 1].
        """
        b = X.new_ones(*X.shape[:-1], 1)
        return torch.cat([X, b], -1)

    def __repr__(self):
tmpstr = self.__class__.__name__
tmpstr += '(\n (U): {}\n)'.format(self.U.size())
return tmpstr
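

# Shape-behaviour sketch for BiAffineAttn (dimensions below are illustrative
# assumptions, not values from the original parser): with out_channels=1 the
# score matrix is squeezed to [batch, t, t]; with out_channels = n_labels it
# stays [batch, n_labels, t, t].
#
#   attn = BiAffineAttn(in_dim=100, out_channels=1)
#   Rh = torch.randn(8, 20, 100)   # head representations [batch, seq_len, dim]
#   Rd = torch.randn(8, 20, 100)   # dependent representations
#   attn(Rh, Rd).shape             # torch.Size([8, 20, 20])
#
#   BiAffineAttn(in_dim=100, out_channels=40)(Rh, Rd).shape
#                                  # torch.Size([8, 40, 20, 20])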


class MLP(nn.Module):
    """
    Module for an MLP with dropout.

    Stacks `depth` Linear layers of width `layer_size`, each optionally
    followed by an activation (looked up by name in torch.nn) and dropout.
    """

    def __init__(self, input_size, layer_size, depth, activation, dropout):
        super(MLP, self).__init__()
        self.layers = nn.Sequential()
        act_fn = getattr(nn, activation) if activation else None
        for i in range(depth):
            self.layers.add_module('fc_{}'.format(i),
                                   nn.Linear(input_size, layer_size))
            if act_fn is not None:
                self.layers.add_module('{}_{}'.format(activation, i),
                                       act_fn())
            if dropout:
                self.layers.add_module('dropout_{}'.format(i),
                                       nn.Dropout(dropout))
            input_size = layer_size

    def forward(self, x):
        return self.layers(x)
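

# Layer-naming sketch for MLP (hypothetical arguments, not taken from the
# original configuration): MLP(100, 500, depth=2, activation='ReLU',
# dropout=0.33) builds a Sequential with modules fc_0, ReLU_0, dropout_0,
# fc_1, ReLU_1, dropout_1, where fc_0 maps 100 -> 500 and fc_1 maps 500 -> 500.
#
#   mlp = MLP(100, 500, depth=2, activation='ReLU', dropout=0.33)
#   mlp(torch.randn(32, 100)).shape    # torch.Size([32, 500])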


class TimeDistributed(nn.Module):
    """
    Module that mimics Keras TimeDistributed
    source: https://discuss.pytorch.org/t/any-pytorch-function-can-work-as-keras-timedistributed/1346/4
    """

    def __init__(self, module, batch_first=True):
        super(TimeDistributed, self).__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
if len(x.size()) <= 2:
return self.module(x)
# Squash samples and timesteps into a single axis
# (samples * timesteps, input_size)
x_reshape = x.contiguous().view(-1, x.size(-1))
y = self.module(x_reshape)
        # Reshape the output back into a sequence
if self.batch_first:
# (samples, timesteps, output_size)
y = y.contiguous().view(x.size(0), -1, y.size(-1))
else:
# (timesteps, samples, output_size)
y = y.contiguous().view(-1, x.size(1), y.size(-1))
return y
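

# A minimal smoke test sketching how these modules fit together in a biaffine
# parser. The dimensions, activation name, and dropout rate below are
# illustrative assumptions, not values from the original training setup.
if __name__ == '__main__':
    batch_size, seq_len, enc_dim = 2, 5, 32
    n_labels = 4
    # Stand-in encoder output (e.g. from a BiLSTM over the sentence).
    H = torch.randn(batch_size, seq_len, enc_dim)

    # Head/dependent MLPs applied at every timestep via TimeDistributed.
    mlp_head = TimeDistributed(MLP(enc_dim, 16, depth=1,
                                   activation='ReLU', dropout=0.1))
    mlp_dep = TimeDistributed(MLP(enc_dim, 16, depth=1,
                                  activation='ReLU', dropout=0.1))
    Rh, Rd = mlp_head(H), mlp_dep(H)      # [2, 5, 16] each

    # Arc scores: one channel, squeezed to [batch, seq_len, seq_len].
    arc_attn = BiAffineAttn(in_dim=16, out_channels=1)
    S_arc = arc_attn(Rh, Rd)
    print(S_arc.shape)                    # torch.Size([2, 5, 5])

    # Label scores: one channel per dependency label.
    lab_attn = BiAffineAttn(in_dim=16, out_channels=n_labels)
    S_lab = lab_attn(Rh, Rd)
    print(S_lab.shape)                    # torch.Size([2, 4, 5, 5])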