-
Notifications
You must be signed in to change notification settings - Fork 0
/
topicmodeling.py
61 lines (46 loc) · 2.51 KB
/
topicmodeling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/home/hackpython/anaconda/bin/python
# Author: Abhishek Sharma
# Program: Implementation of Non-negative Matrix Factorization ( NMF ) and Latent Dirichlet Allocation ( LDA )
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import sys
import os
def display_topics(model, feature_names, no_top_words):
    """Print the `no_top_words` highest-weighted terms of each topic.

    Works for any fitted sklearn topic model exposing `components_`
    (one row of term weights per topic), e.g. NMF or LDA.
    """
    for idx, weights in enumerate(model.components_):
        # Indices of the largest weights, in descending order.
        top = weights.argsort()[::-1][:no_top_words]
        print("Topic:", idx)
        print(" ".join(feature_names[i] for i in top))
def tfidf_vectorizer(documents, total_features):
    """Fit a TF-IDF model over *documents*.

    Keeps at most *total_features* terms, dropping English stop words,
    terms in >95% of documents, and terms in fewer than 2 documents.
    Returns (vectorizer, tfidf_matrix, feature_names).
    """
    # Local name must not shadow this function's own name.
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                 max_features=total_features,
                                 stop_words='english')
    tfidf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2;
    # fall back to it only on old versions that lack the new API.
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    return vectorizer, tfidf, feature_names
def count_vectorizer(documents, total_features):
    """Fit a raw term-count model over *documents*.

    Same vocabulary filtering as the TF-IDF helper (<=total_features
    terms, English stop words removed, max_df=0.95, min_df=2).
    Returns (vectorizer, count_matrix, feature_names).
    """
    # Local name must not shadow this function's own name.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                 max_features=total_features,
                                 stop_words='english')
    tf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2;
    # fall back to it only on old versions that lack the new API.
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    return vectorizer, tf, feature_names
# Data set: 20 newsgroups, with headers/footers/quotes stripped so the
# learned topics reflect message bodies rather than metadata.
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Vocabulary size to keep.
total_features = 1000

# Assign to fresh names so the helper functions above are not shadowed
# by module-level variables of the same name.
tfidf_vec, tfidf, tfidf_feature_names = tfidf_vectorizer(documents, total_features)
tf_vec, tf, tf_feature_names = count_vectorizer(documents, total_features)

num_topic = 5

# Non-negative Matrix Factorization on TF-IDF weights.
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
# 'alpha' was removed in scikit-learn 1.2; alpha_W (with the default
# alpha_H="same") reproduces the old alpha=.1 regularization.
model_nmf = NMF(n_components=num_topic, random_state=1, alpha_W=.1,
                l1_ratio=.5, init='nndsvd').fit(tfidf)

# Latent Dirichlet Allocation is a probabilistic model of term COUNTS,
# so fit it on the raw count matrix (tf), not on TF-IDF weights.
# 'n_topics' was renamed 'n_components' (removed in scikit-learn 0.21).
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
model_lda = LatentDirichletAllocation(n_components=num_topic, max_iter=30,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0).fit(tf)

no_top_words = 10
display_topics(model_nmf, tfidf_feature_names, no_top_words)
display_topics(model_lda, tf_feature_names, no_top_words)

# Interactive visualisation; pass the count matrix/vectorizer the LDA
# model was actually fit on.
# NOTE(review): recent pyLDAvis releases renamed pyLDAvis.sklearn to
# pyLDAvis.lda_model — confirm against the installed version.
data = pyLDAvis.sklearn.prepare(model_lda, tf, tf_vec)
pyLDAvis.show(data)