-
Notifications
You must be signed in to change notification settings - Fork 1
/
split_temporal.py
137 lines (108 loc) · 4.17 KB
/
split_temporal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# To run: split_temporal.py 'collection_name' 1 1
# 1st argument : collection name (agriculture, development, environment, industrialization, lifestyle)
# 2nd argument : split? (yes:1 , no:0)
# 3rd argument : train? (yes:1 , no:0)
#!/usr/bin/env python
# coding: utf-8
import sys
import pandas as pd
import pickle
import zlib
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
#Folder and file paths (Make sure paths are correct before using this file)
FOLDER = './' # Folder with this code
PATHS = {
    'Dataset': FOLDER + 'Datasets/',
    'Train': FOLDER + 'Split/Temporal/train_dataset_',
    'Test': FOLDER + 'Split/Temporal/test_dataset_',
    'Model': FOLDER + 'Split/Temporal/model_',
}
# Map each district census code to its label, read from the pace.csv lookup.
df = pd.read_csv('Files/pace.csv')
census = list(df['census_code'])
label = list(df['labels'])
# dict(zip(...)) replaces the manual index loop; produces the same mapping.
dist_map = dict(zip(census, label))
# ----Split a single collection
def Split(dataset_name, year=2018):
    """Split one collection's dataset into train/test halves by year and save both.

    Loads the zlib-compressed pickle at PATHS['Dataset']+dataset_name, splits
    rows on `year` (strictly earlier rows -> train, the rest -> test), and
    writes each half as a zlib-compressed pickle under PATHS['Train'] and
    PATHS['Test'].

    Parameters
    ----------
    dataset_name : str
        File name of the form 'dataset_<collection>'; the collection name is
        everything after the 'dataset_' prefix.
    year : int, optional
        Split year (default 2018, the original hard-coded value).
    """
    collection_name = dataset_name[8:]  # strip the 'dataset_' prefix
    print('\nCollection:', collection_name.capitalize())
    # Load the compressed pickled dataset; `with` guarantees the handle closes.
    with open(PATHS['Dataset'] + dataset_name, 'rb') as f:
        dataset = pickle.loads(zlib.decompress(pickle.load(f)))
    df = pd.DataFrame(dataset)
    df.columns = ['ArticleId','Title','Text','Keywords','Date','DistrictId','Emp','Growth','Type']
    # NOTE(review): .str[0] takes only the FIRST element/character of Date —
    # this assumes Date is a sequence whose first item is the year. Confirm
    # against the dataset format before changing.
    before_year = (df['Date'].str[0]).apply(pd.to_numeric) < year
    # Boolean masks instead of groupby + 2-tuple unpack: groupby raises
    # ValueError when every row lands on one side of the split; masks do not.
    train_df = df[before_year]
    test_df = df[~before_year]
    print(len(train_df))
    print(len(test_df))
    train_df = train_df.values.tolist()
    test_df = test_df.values.tolist()
    # Persist each half as a zlib-compressed pickle (same format Split's
    # consumers expect).
    with open(PATHS['Train'] + collection_name, 'wb') as f:
        pickle.dump(zlib.compress(pickle.dumps(train_df), pickle.HIGHEST_PROTOCOL), f, pickle.HIGHEST_PROTOCOL)
    with open(PATHS['Test'] + collection_name, 'wb') as f:
        pickle.dump(zlib.compress(pickle.dumps(test_df), pickle.HIGHEST_PROTOCOL), f, pickle.HIGHEST_PROTOCOL)
# -- Running model training
def DT2V_train(collection_name):
    """Train a Doc2Vec model on a collection's train split and save it.

    Loads the compressed train pickle written by Split(), builds one
    TaggedDocument per row, trains Doc2Vec for 100 epochs, and saves the
    model under PATHS['Model'].

    Parameters
    ----------
    collection_name : str
        Collection name ('agriculture', 'development', ...).
    """
    print(collection_name.capitalize())
    # Load the train split; `with` guarantees the handle closes on error.
    with open(PATHS['Train'] + collection_name, 'rb') as f:
        dataset = pickle.loads(zlib.decompress(pickle.load(f)))
    # Row layout: [ArticleId, Title, Text, Keywords, Date, DistrictId, Emp, Growth, Type].
    # Words come from Keywords (index 3); tags are ArticleId, DistrictId, Emp, Growth.
    documents = [TaggedDocument(row[3], [row[0], row[5], row[6], row[7]]) for row in dataset]
    print('Documents Collected.')
    # Declaring the DT2V Model.
    model = Doc2Vec(vector_size=50, window=3, min_count=3, alpha=0.1, min_alpha=0.001)
    print('Model Initialized.')
    # Building vocabulary.
    model.build_vocab(documents)
    # NOTE(review): model.wv.vocab is the gensim 3.x API (removed in gensim 4,
    # where it is model.wv.key_to_index) — confirm the pinned gensim version.
    print('Vocabulary size: ', len(model.wv.vocab.keys()))
    # Train one epoch at a time so progress can be logged every 10 epochs via
    # the cosine similarity of two fixed reference documents (94 and 519).
    start = time.time()
    for epoch in range(1, 101):
        model.train(documents, total_examples=len(documents), epochs=1)
        if epoch == 1 or epoch % 10 == 0:
            print('Epoch :', epoch, cosine_similarity([model.docvecs[94], model.docvecs[519]])[0][1])
    print('Elapsed Time: ', time.time() - start)  # fixed 'Elasped' typo in output
    print('Model Trained.')
    # Saving model (gensim's save accepts an open file handle).
    with open(PATHS['Model'] + collection_name, 'wb') as f:
        model.save(f)
    print('Model Saved.\n\n')
#----------- Methods end---------------#
#------------- Main-------------------#
# Create dataset
# Collect arguments and specify the collection.
VALID_COLLECTIONS = ('agriculture', 'development', 'environment',
                     'industrialization', 'lifestyle')
if len(sys.argv) < 4:
    # Missing arguments previously raised a bare IndexError.
    print('Usage: split_temporal.py <collection> <split 0|1> <train 0|1>')
    exit(1)
collec = sys.argv[1]
if collec in VALID_COLLECTIONS:
    # Every valid dataset file name is just the collection with this prefix,
    # so the five-branch if/elif chain collapses to one membership test.
    dataset = 'dataset_' + collec
else:
    dataset = None
    print('Dataset not specified')
    # Exit nonzero on the error path (was exit(0), which reports success).
    exit(1)
SPLIT = sys.argv[2]
TRAIN = sys.argv[3]
# Split the dataset
if SPLIT == '1':
    start = time.time()
    Split(dataset)
    end = time.time()
    print('Time to modify: ', end - start)
# Train the model
if TRAIN == '1':
    start = time.time()
    DT2V_train(dataset[8:])  # dataset[8:] is already a str; dropped redundant str()
    end = time.time()
    print('Time to train: ', end - start)