-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_array.py
49 lines (40 loc) · 1.32 KB
/
feature_array.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from mongo import load_from_mongo
import numpy as np
import re
import lda
# Build one term-count vector per document returned by
# load_from_mongo("hindu_modified", "docs1"), fit a 10-topic LDA model on
# those vectors, and print the highest-weighted words of each topic.

# One row of term counts per document, in the order the documents are read.
feature_array = []
result = load_from_mongo("hindu_modified", "docs1")

# vocab.txt is read as one vocabulary entry per line; the position of an
# entry in this list is its column in every count vector.  The context
# manager closes the file as soon as it has been read.
with open('vocab.txt', 'r') as f:
    feature_vector = f.read().split('\n')
len_of_feature_vector = len(feature_vector)

# Entry -> column lookup, built once so each token costs a dict lookup
# instead of a linear list.index() scan.  setdefault keeps the FIRST position
# of a duplicated entry, which is the position list.index() would return.
vocab_index = {}
for position, word in enumerate(feature_vector):
    vocab_index.setdefault(word, position)

# Everything except ASCII letters, digits and spaces is turned into a space.
non_alnum = re.compile(r'[^a-zA-Z0-9 ]')

for doc in result:
    text = non_alnum.sub(' ', doc["text"])
    vector = [0] * len_of_feature_vector
    # Splitting on a single space yields empty tokens for consecutive spaces.
    # They are counted only if the vocabulary itself has an empty entry (for
    # example from a trailing newline in vocab.txt).
    for token in text.split(" "):
        column = vocab_index.get(token.lower())
        # Tokens missing from the vocabulary are skipped instead of aborting
        # the whole run with ValueError.
        if column is not None:
            vector[column] += 1
    feature_array.append(vector)
print(len(feature_array))

model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
# Hand the model a 2-D ndarray rather than a nested Python list so the input
# has a definite shape and dtype.
model.fit(np.asarray(feature_array))
topic_word = model.topic_word_
n_top_words = 7

# Built once: indexing this array with sorted column numbers gives the words.
vocab_array = np.array(feature_vector)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the first n_top_words
    # columns.  (A [:-n_top_words:-1] slice would return one word fewer.)
    top_columns = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_columns]
    print("topic {}:{}".format(i, " ".join(topic_words)))
##titles=[each['HD'] for each in result]
##titles=tuple(titles)
##
##for i in range(5):
## print("{} (index-{}))".format(titles[i],i))