test.py
import os.path
import string

import joblib
import nltk
from newsapi import NewsApiClient
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from goose3 import Goose
from gensim.models import HdpModel
from gensim import corpora
# English stopwords plus punctuation, stripped out during tokenization.
stopset = list(set(stopwords.words('english'))) + list(string.punctuation)
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    # Reduce each token to its Porter stem.
    return [stemmer.stem(item) for item in tokens]

def tokenize(text):
    # Tokenize, drop stopwords/punctuation, and stem what remains.
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if w not in stopset]
    return stem_tokens(tokens, stemmer)

def jaccard(vector):
    # Despite the name, this returns pairwise cosine similarities, not Jaccard scores.
    sims = cosine_similarity(vector)
    return sims
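
# For contrast with the cosine-based helper above, a minimal sketch of a true
# Jaccard similarity over token sets. jaccard_sets is an illustrative addition
# and is not called elsewhere in this script.
def jaccard_sets(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a | b) else 0.0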

# init
g = Goose()
timelines = []
newsapi = NewsApiClient(api_key='1f62f144d9584aaeb3fb553f42c989a6')
# restore previously cached timelines, if any
if os.path.exists('./timelines.pkl'):
    timelines = joblib.load('./timelines.pkl')
# start: fetch the current top headlines
# other candidate sources: abc-news, buzzfeed, daily-mail, bbc-news, cbc-news, cnn
top_headlines = newsapi.get_top_headlines(language='en', sources='google-news', page_size=100)
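# top_headlines is a dict of the form {'status': ..., 'totalResults': ..., 'articles': [...]}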
# pull the article URLs out of the response
urls = [a['url'] for a in top_headlines['articles'] if a['url']]
# extract and tokenize the full text of each article
articles = []
for u in urls:
    try:
        content = g.extract(url=u)
        text = content.cleaned_text.replace("\n", " ")
        article = {}
        article['keywords'] = tokenize(text)
        article['url'] = u
        article['title'] = content.title
        articles.append(article)
    except Exception:
        # skip articles that fail to download or parse
        continue
# prepend the articles from previously cached timelines to the new batch
if len(timelines) > 0:
    recent_timelines = [t[0] for t in timelines]
    articles = recent_timelines + articles
# build a bag-of-words corpus over all articles and fit an HDP topic model
texts = [a['keywords'] for a in articles]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
hdp = HdpModel(corpus, dictionary)
print(hdp.show_topics(num_topics=-1, num_words=10))
topics = hdp.print_topics(num_topics=-1)
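
# Minimal sketch of one intended use (an assumption, relying on gensim's standard
# transformation API): map each article onto its highest-weight HDP topic.
# Indexing the model with a bag-of-words vector yields (topic_id, weight) pairs.
for article, bow in zip(articles, corpus):
    topic_dist = hdp[bow]
    if topic_dist:
        best_topic, weight = max(topic_dist, key=lambda tw: tw[1])
        print(article['title'], '-> topic', best_topic, round(weight, 3))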
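
# A sketch of the matching save step for the cache loaded above, assuming
# timelines keeps the list-of-timelines shape the load path expects:
joblib.dump(timelines, './timelines.pkl')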