-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathText_Minging.py
51 lines (40 loc) · 1.49 KB
/
Text_Minging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from cos_nmf import COSNMF
import numpy as np
import sys
# Toy corpus: four short documents on unrelated subjects, used as the input
# for the TF-IDF vectorizer and the topic models in this script.
corpus = [
    "this is the first document it is talking about the industrial revolution",
    "this is the second document in which we discuss the importance of oral hygene",
    "within this third document we will talk about the effect of radio active waves on the human body",
    "lastly we would like to discuss the drawbacks of smoking of course",
]
# Number of latent topics requested from every factorization in this script.
topicNbr = 2

# Build the TF-IDF document-term matrix, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(corpus)

# Dense copy of the TF-IDF matrix; printed here and reused as plain-array input.
M = tfidf.toarray()

# Factorize the sparse TF-IDF matrix. fit() returns the estimator itself, so
# the call can be chained onto the constructor.
nmf = NMF(n_components=topicNbr, random_state=1).fit(tfidf)

# Lift NumPy's summarization threshold so arrays are printed in full rather
# than being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)
print(M)

# Vocabulary terms, indexed by column of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()
# Print the highest-weighted vocabulary terms of every NMF topic.
#
# Fix: the previous slice `[:10 - 1:-1]` evaluates to `[:9:-1]`, which walks
# from the last index down to index 10. It therefore yielded
# (vocabulary size - 10) terms -- and nothing at all for a vocabulary of ten
# terms or fewer -- instead of the ten strongest terms.
n_top_words = 10
for topic_idx, topic in enumerate(nmf.components_):
    print("topic #%d" % topic_idx)
    # argsort() orders indices by ascending weight: reverse it, then keep the
    # leading n_top_words entries (fewer if the vocabulary is smaller).
    top_indices = topic.argsort()[::-1][:n_top_words]
    print(" ".join(feature_names[i] for i in top_indices))
# NOTE(review): indentation was lost in SOURCE; this trailing print() is
# assumed to run once after the loop -- confirm against the intended output.
print()
# For every document, print its index, its text and its weights over the
# NMF topics.
for doc_idx, text in enumerate(corpus):
    print(f"Document #{doc_idx}")
    print(text)
    # Transform only this document's TF-IDF row; the result holds a single
    # row, which is what gets printed.
    doc_topics = nmf.transform(tfidf[doc_idx])
    print("Topic distribution:", doc_topics[0])
    # NOTE(review): indentation was lost in SOURCE; this blank-line print() is
    # assumed to belong to the loop body (one separator per document) -- confirm.
    print()
# Second factorization: NMF with random initialization, run on the dense
# matrix M. W is the result of fit_transform and H the fitted components.
model = NMF(n_components=topicNbr, init='random', random_state=0)
W = model.fit_transform(M)
H = model.components_

# Print the input matrix and both factors. Note that the "H:" label is
# followed by the transpose of H, not H itself.
for label, matrix in (("M:\n ", M), ("W:\n ", W), ("H:\n ", H.T)):
    print(label, matrix)
# Run the project's COSNMF routine on the same dense matrix and topic count,
# then print the two values it returns.
# NOTE(review): COSNMF lives in the local cos_nmf module, which is not visible
# here; presumably W1/H1 are its counterparts of the W/H factors -- confirm.
W1, H1 = COSNMF(M, topicNbr)
for label, factor in (("W1: \n", W1), ("H1: \n", H1)):
    print(label, factor)
# Raw dump: every topic index followed by its full component vector from the
# first NMF model ...
for numbered_topic in enumerate(nmf.components_):
    print(*numbered_topic)

# ... and then every document index followed by the document text.
for numbered_doc in enumerate(corpus):
    print(*numbered_doc)