#################################
# 5_Skip_Gram.py
#
# Author: Paul Soto
# [email protected]
#
# This script trains a Skip-Gram model on a toy sample of documents.
# Negative sampling would normally be used to train the network, but
# plain gradient descent is used here so the focus stays on the
# architecture rather than on efficient estimation.
#################################
import numpy as np
import itertools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3D projection used below
# M is the number of context words to look at on each side of a word
M = 2
# H is the dimension of the hidden layer, i.e. the embedding size
H = 3
def sigma(vector, deriv=False):
    """
    Return the softmax of a vector of real values.

    vector: numpy array of real values
    deriv: if True, return the element-wise derivative term
           sigma(vector)*(1 - sigma(vector)) used in the weight updates
    """
    if deriv:
        return sigma(vector)*(1 - sigma(vector))
    else:
        # Shift by the max before exponentiating for numerical stability;
        # the softmax value is unchanged
        exp = np.exp(vector - vector.max())
        return exp/exp.sum()
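# Quick sanity check (added here, not part of the original script): the
# softmax returned by sigma should be non-negative and sum to one.
assert np.isclose(sigma(np.array([1.0, 2.0, 3.0])).sum(), 1.0)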
def get_context(word_list):
    """
    Return, for each word in word_list, a (word, context) pair where the
    context is the up-to-2*M surrounding words (M on each side, truncated
    at the boundaries of the list).

    word_list: list of words from a single document
    M: global window size
    """
    samples = []
    for word_ind in range(len(word_list)):
        context_inds = list(range(max(0, word_ind-M),
                                  min(word_ind+M+1, len(word_list))))
        context_inds.remove(word_ind)
        context = [word_list[el] for el in context_inds]
        samples.append((word_list[word_ind], context))
    return samples
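# Illustrative example (added, not part of the original script): with M = 2
# the first word of a sentence gets the two words to its right as context.
assert get_context(["we", "think", "fears", "about"])[0] == ("we", ["think", "fears"])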
docs = ["we think uncertainty about unemployment",
"uncertainty and fears about inflation",
"we think fears about unemployment",
"we think fears and uncertainty about inflation and unemployment",
"constant negative press covfefe"]
# Split each document into a list of words
docs_split = [doc.split() for doc in docs]
docs_words = list(itertools.chain(*docs_split))
# Find the unique words across all documents
words = np.unique(docs_words)
# Generate a one-hot encoded vector for each unique word
vectors = np.eye(words.shape[0])
# Randomly initialize the input->hidden (U) and hidden->output (V) weight matrices
U = np.random.randn(H, words.shape[0])
V = np.random.randn(words.shape[0], H)
# Create list of all training examples
training = list(itertools.chain(*map(get_context,docs_split)))
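# Sanity check (added, not part of the original script): every word token
# yields exactly one (word, context) training example.
assert len(training) == len(docs_words)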
log_likelihood = np.array([])
epochs = 10000
learning_rate = 0.001
tolerance = 0.001
# Decay the learning rate linearly towards zero over the epochs
discount = learning_rate/epochs
for epoch in range(epochs):
    likelihood = 0
    for example in training:
        # Forward propagate the input word through the network
        input_index = np.where(words == example[0])[0][0]
        l_input = vectors[input_index]
        l_hidden = np.dot(U, l_input)
        l_output = np.dot(V, l_hidden)
        l_output_a = sigma(l_output)
        errors = np.zeros(words.shape[0])
        # Accumulate the error for each word in the context window
        for context in example[1]:
            output_index = np.where(words == context)[0][0]
            l_target = vectors[output_index]
            errors += (l_output_a - l_target)
        # Update the weights of the U and V matrices
        delta2 = errors*sigma(l_output, True)
        V -= learning_rate*np.outer(delta2, l_hidden)
        U -= learning_rate*np.outer(np.dot(V.T, delta2), l_input)
        likelihood += sum(map(np.log, l_output_a))
    log_likelihood = np.append(log_likelihood, likelihood)
    learning_rate -= discount
    # Stop once the change in log-likelihood falls below the tolerance
    if epoch < 2:
        continue
    if abs(likelihood - log_likelihood[-2]) < tolerance:
        break
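# Optional post-training inspection (a minimal sketch, added here and not part
# of the original script): the columns of U are the learned H-dimensional word
# embeddings, so cosine similarity between columns gives a rough sense of how
# related two words are after training.
def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

for pair in [("inflation", "unemployment"), ("inflation", "covfefe")]:
    i, j = (np.where(words == w)[0][0] for w in pair)
    print(pair, cosine_similarity(U[:, i], U[:, j]))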
# Plot out word embeddings and log-likelihood function
fig = plt.figure()
ax = fig.add_subplot(1,2,1,projection="3d")
ax.scatter(U[0],U[1],U[2], alpha=0.3)
for i, txt in enumerate(words):
    ax.text(U[0][i], U[1][i], U[2][i], txt, size=10)
ax = fig.add_subplot(1,2,2)
ax.plot(log_likelihood)
plt.show()