-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_parser.py
36 lines (34 loc) · 1.33 KB
/
news_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# This module queries the Google News RSS endpoint and parses the resulting XML document tree.
import requests
from defusedxml.minidom import parseString
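# defusedxml is used instead of xml.dom.minidom because it rejects malicious XML
# constructs (e.g. entity-expansion bombs, external entities) in untrusted feeds.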
def _first_child_text(node):
    """Return the ``data`` of *node*'s first child, or '' if unavailable."""
    # getattr covers both an empty element (firstChild is None) and a first
    # child that is not a text/CDATA node and therefore has no ``data``.
    return getattr(node.firstChild, 'data', '')


def get_google_news_result(term, count, timeout=10):
    """Fetch the Google News RSS feed for *term* and extract its items.

    Args:
        term: Search term sent as the ``q`` query parameter. It is
            URL-encoded by requests, so spaces and reserved characters
            are safe.
        count: Upper bound applied to the item list via ``items[:count]``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the caller forever.

    Returns:
        A tuple ``(titles, agencies, links, infos)`` of four parallel
        lists with one entry per processed ``<item>``. ``titles``,
        ``agencies`` and ``links`` hold strings ('' when the matching
        child element is absent or empty). Each entry of ``infos`` is a
        dict that may contain the keys ``"publication date"`` and
        ``"description"``.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails
            or times out.
    """
    # Let requests build the query string so the term is properly escaped
    # instead of being interpolated verbatim into the URL.
    response = requests.get(
        'http://news.google.com/news',
        params={'q': term, 'output': 'rss'},
        timeout=timeout,
    )
    # Parse the response body as an XML document object.
    obj = parseString(response.text)
    items = obj.getElementsByTagName('item')

    # Parallel result lists: titles, agencies, links and extra info.
    titles = []
    links = []
    agencies = []
    infos = []
    for item in items[:count]:
        title, agency, link = '', '', ''
        info = {}
        for node in item.childNodes:
            if node.nodeName == 'title':
                title = _first_child_text(node)
            elif node.nodeName == 'link':
                link = _first_child_text(node)
            elif node.nodeName == 'pubDate':
                info["publication date"] = _first_child_text(node)
            elif node.nodeName == 'description':
                info["description"] = _first_child_text(node)
            elif node.nodeName == 'source':
                agency = _first_child_text(node)
        titles.append(title)
        links.append(link)
        infos.append(info)
        agencies.append(agency)
    return titles, agencies, links, infos