-
Notifications
You must be signed in to change notification settings - Fork 0
/
nyt_requests.py
96 lines (74 loc) · 3.07 KB
/
nyt_requests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import logging
import time
from unstructured.partition.html import partition_html
from lxml.html import tostring
from unstructured.staging.base import convert_to_dict
from nltk.tokenize import word_tokenize
from common import *
import google.cloud.logging
import google.auth
# Root of the site being scraped; also seeds the progress tracker in create_new_state().
BASE_URL = "https://www.nytimes.com"
# XPath selecting all anchor tags. Not referenced anywhere in this file —
# presumably consumed by the shared scraper framework imported from `common`; verify before removing.
LINKS_XPATH = "//a"
@extractor_func(scraper="nyt", required=True)
def extract_article_section(root, output):
    """Record the article's section label (masthead text) as ``output["subsection"]``.

    Raises IndexError if the masthead label div is absent; since the
    extractor is registered with required=True, that failure is expected
    to mark the page as unextractable.
    """
    label_div = root.xpath("//div[@id='masthead-section-label']")[0]
    output["subsection"] = label_div.text_content().strip()
@extractor_func(scraper="nyt", required=True)
def extract_article_title(root, output):
    """Record the article headline as ``output["title"]``.

    Raises IndexError when no ``<h1 data-testid="headline">`` exists;
    required=True means that aborts extraction for the page.
    """
    headline = root.xpath("//h1[@data-testid='headline']")[0]
    output["title"] = headline.text_content().strip()
@extractor_func(scraper="nyt", required=True)
def extract_published_time(root, output):
    """Record the publication timestamp as ``output["published"]``.

    Reads the ``datetime`` attribute of the first <time> element matching
    the NYT-generated CSS class. NOTE(review): the class name
    'css-1g7pp1u e16638kd0' is an auto-generated style hash and is likely
    to break when the site is redeployed — a sturdier selector would help.
    """
    published_node = root.xpath("//time[@class='css-1g7pp1u e16638kd0']")[0]
    output["published"] = published_node.attrib["datetime"]
@extractor_func(scraper="nyt", required=False)
def extract_summary(root, output):
    """Record the article's summary paragraph as ``output["summary"]``.

    Registered with required=False: pages without an
    ``<p id="article-summary">`` raise IndexError here, which the
    framework presumably tolerates for optional extractors.
    """
    summary_node = root.xpath("//p[@id='article-summary']")[0]
    output["summary"] = summary_node.text_content().strip()
@extractor_func(scraper="nyt", required=True)
def extract_body(root, output):
    """Extract the cleaned article body into ``output["body"]``.

    Partitions the ``<section name="articleBody">`` HTML with
    unstructured's HTML partitioner, then keeps only Title and
    NarrativeText elements that look like real prose, emitting a list of
    ``{"type": ..., "text": ...}`` dicts.

    Filters applied to each partitioned element:
      * element type must be Title or NarrativeText
      * text containing "|" is dropped (looks like a credit/byline
        separator — TODO confirm against sample pages)
      * text shorter than 3 tokens is dropped

    Raises IndexError if the articleBody section is missing
    (required=True, so that aborts extraction for the page).
    """
    body_html = tostring(root.xpath("//section[@name='articleBody']")[0])
    elements = convert_to_dict(partition_html(text=body_html))
    accepted = {"Title", "NarrativeText"}
    result = []
    for element in elements:
        text = element["text"]
        # Fix: run the cheap predicates first; the original called
        # word_tokenize (the expensive check) on every element before the
        # O(1) set-membership and substring tests. All three filters are
        # pure, so reordering cannot change which elements survive.
        if element["type"] not in accepted:
            continue
        if "|" in text:
            continue
        if len(word_tokenize(text)) < 3:
            continue
        result.append({"type": element["type"], "text": text})
    output["body"] = result
def create_new_state(credential_path=None):
    """Build a fresh (writer, tracker, getter) triple for one scrape run.

    Args:
        credential_path: optional path to GCP credentials forwarded to
            the bucket writer; None falls back to the writer's default
            credential resolution.

    Returns:
        (writer, tracker, getter): the GCS-backed page writer, an
        in-memory progress tracker seeded with BASE_URL and the writer's
        already-saved pages, and an HTTP getter with 3 retries.
    """
    writer = GCPBucketDirectoryWriter(
        bucket="nyt-articles",
        credential_path=credential_path,
    )
    url_filters = [
        create_robot_filter(BASE_URL),
        create_regex_filter(r"https?://www\.nytimes\.com"),
    ]
    # The writer is opened here only so saved_pages can be read to seed
    # the tracker's visited set; it is re-entered by the caller per write.
    with writer:
        tracker = InMemProgressTracker(
            starting_set=[BASE_URL],
            visited=writer.saved_pages,
            filters=url_filters,
        )
        getter = RequestGetter(retry=3)
        return writer, tracker, getter
# Recreate the scraper state once per day so newly published articles
# are picked up on a fresh crawl frontier.
RESET_TIME = 3600 * 24

if __name__ == "__main__":
    # Route stdlib logging through Google Cloud Logging for this project.
    log_client = google.cloud.logging.Client(project="msca310019-capstone-f945")
    log_client.setup_logging()
    logging.getLogger().setLevel(logging.INFO)

    writer, tracker, getter = create_new_state()
    last_reset = time.time()
    # Run forever: scrape in 5-minute bursts, flushing the index between
    # bursts, and rebuild all state once RESET_TIME has elapsed.
    while True:
        now = time.time()
        if now - last_reset > RESET_TIME:
            writer, tracker, getter = create_new_state()
            logging.info("Restarting scrape to get new articles")
            last_reset = time.time()
        start_scraper("nyt", getter=getter, writer=writer, progressor=tracker, duration=5 * 60)
        with writer:
            writer.write_index()