@@ -7,20 +7,25 @@
 from . import content, dates, languages, titles, urls, webpages

 # work around to read the version from the pyproject.toml so it is maintained in one place
-__version__ = importlib.metadata.version('mediacloud-metadata')
+__version__ = importlib.metadata.version("mediacloud-metadata")

 logger = logging.getLogger(__name__)

 # Publication dates more than this many days in the future will be ignored (because they are probably bad guesses)
 MAX_FUTURE_PUB_DATE = 90

-STAT_NAMES = ['total', 'fetch', 'url', 'pub_date', 'content', 'title', 'language']
+STAT_NAMES = ["total", "fetch", "url", "pub_date", "content", "title", "language"]

 stats = {s: 0 for s in STAT_NAMES}


-def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
-            defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
-            stats_accumulator: Mapping[str, int] = None) -> Dict:
+def extract(
+    url: str,
+    html_text: Optional[str] = None,
+    include_other_metadata: Optional[bool] = False,
+    defaults: Mapping[str, Any] = {},
+    overrides: Mapping[str, Any] = {},
+    stats_accumulator: Mapping[str, int] = None,
+) -> Dict:
     """
     The core method of this library - returns all the useful information extracted from the HTML of the news
     article at the supplied URL.
@@ -43,28 +48,34 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
     for `STAT_NAMES` (see above).
     """
-    if stats_accumulator is None:  # can't default to global because of Python reference handling in defaults
+    if (
+        stats_accumulator is None
+    ):  # can't default to global because of Python reference handling in defaults
         stats_accumulator = stats
     t0 = time.monotonic()
     # first fetch the real content (if we need to)
     t1 = t0
     if html_text is None:
         raw_html, response = webpages.fetch(url)
         # check for archived URLs
-        if 'memento-datetime' in response.headers:
+        if "memento-datetime" in response.headers:
             try:
-                final_url = response.links['original']['url']  # the original url archived
+                final_url = response.links["original"][
+                    "url"
+                ]  # the original url archived
             except KeyError:
                 # maybe the responder doesn't provide the desired headers, so just fall back on the full URL because
                 # there's nothing else we can really do
                 final_url = response.url  # followed all the redirects
         else:
             final_url = response.url  # followed all the redirects
     else:
-        final_url = url  # trust that the user knows which URL the content actually came from
+        final_url = (
+            url  # trust that the user knows which URL the content actually came from
+        )
         raw_html = html_text
     fetch_duration = time.monotonic() - t1
-    stats_accumulator['fetch'] += fetch_duration
+    stats_accumulator["fetch"] += fetch_duration

     # url
     t1 = time.monotonic()
@@ -73,60 +84,65 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     is_homepage_url = urls.is_homepage_url(url)
     is_shortened_url = urls.is_shortened_url(url)
     url_duration = time.monotonic() - t1
-    stats_accumulator['url'] += url_duration
+    stats_accumulator["url"] += url_duration

     # pub date stuff
     t1 = time.monotonic()
     max_pub_date = dt.datetime.now() + dt.timedelta(days=+MAX_FUTURE_PUB_DATE)
-    if 'publication_date' in overrides:
-        pub_date = overrides['publication_date']
+    if "publication_date" in overrides:
+        pub_date = overrides["publication_date"]
     else:
-        default_date = defaults.get('publication_date') if defaults else None
-        pub_date = dates.guess_publication_date(raw_html, final_url, max_date=max_pub_date, default_date=default_date)
+        default_date = defaults.get("publication_date") if defaults else None
+        pub_date = dates.guess_publication_date(
+            raw_html, final_url, max_date=max_pub_date, default_date=default_date
+        )
     pub_date_duration = time.monotonic() - t1
-    stats_accumulator['pub_date'] += pub_date_duration
+    stats_accumulator["pub_date"] += pub_date_duration

     # content
     t1 = time.monotonic()
-    if 'text_content' in overrides:
-        article = dict(extraction_method=content.METHOD_OVERRIDEN,
-                       text=overrides['text_content'])
+    if "text_content" in overrides:
+        article = dict(
+            extraction_method=content.METHOD_OVERRIDEN, text=overrides["text_content"]
+        )
     else:
         article = content.from_html(final_url, raw_html, include_other_metadata)
     content_duration = time.monotonic() - t1
-    stats_accumulator['content'] += content_duration
+    stats_accumulator["content"] += content_duration

     # title
     t1 = time.monotonic()
-    if 'article_title' in overrides:
-        article_title = overrides['article_title']
+    if "article_title" in overrides:
+        article_title = overrides["article_title"]
     else:
-        article_title = titles.from_html(raw_html, article['title'])
+        article_title = titles.from_html(raw_html, article["title"])
         if article_title is None:
-            article_title = defaults.get('article_title') if defaults else None
+            article_title = defaults.get("article_title") if defaults else None
     normalized_title = titles.normalize_title(article_title)
     title_duration = time.monotonic() - t1
-    stats_accumulator['title'] += title_duration
+    stats_accumulator["title"] += title_duration

     # language
     t1 = time.monotonic()
-    if 'language' in overrides:
-        full_language = overrides['language']
+    if "language" in overrides:
+        full_language = overrides["language"]
     else:
-        full_language = languages.from_html(raw_html, article['text'])  # could be something like "pt-br"
+        full_language = languages.from_html(
+            raw_html, article["text"]
+        )  # could be something like "pt-br"
         if full_language is None:
-            full_language = defaults.get('language') if defaults else None
+            full_language = defaults.get("language") if defaults else None
     language_duration = time.monotonic() - t1
-    stats_accumulator['language'] += language_duration
+    stats_accumulator["language"] += language_duration

     # canonical url
-    if 'canonical_url' in overrides:
-        canonical_url = overrides['canonical_url']
+    if "canonical_url" in overrides:
+        canonical_url = overrides["canonical_url"]
     else:
-        canonical_url = article.get('canonical_url')
+        canonical_url = article.get("canonical_url")

     total_duration = time.monotonic() - t0
-    stats_accumulator['total'] += total_duration
+    stats_accumulator["total"] += total_duration

     results = dict(
         original_url=url,
@@ -136,23 +152,31 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
         canonical_domain=canonical_domain,
         canonical_url=canonical_url,
         publication_date=pub_date,
-        language=full_language[:2] if full_language else full_language,  # keep this as a two-letter code, like "en"
+        language=(
+            full_language[:2] if full_language else full_language
+        ),  # keep this as a two-letter code, like "en"
         full_language=full_language,  # could be a full region language code, like "en-AU"
-        text_extraction_method=article['extraction_method'],
+        text_extraction_method=article["extraction_method"],
         article_title=article_title,
         normalized_article_title=normalized_title,
-        text_content=article['text'],
+        text_content=article["text"],
         is_homepage=is_homepage_url,
         is_shortened=is_shortened_url,
         version=__version__,
     )
     if include_other_metadata:
         # other metadata we've done less robust validation on, but might be useful
-        results['other'] = dict(
-            raw_title=article['title'] if 'title' in article else None,
-            raw_publish_date=article['potential_publish_date'] if 'potential_publish_date' in article else None,
-            top_image_url=article['top_image_url'] if 'top_image_url' in article else None,
-            authors=article['authors'] if 'authors' in article else None,
+        results["other"] = dict(
+            raw_title=article["title"] if "title" in article else None,
+            raw_publish_date=(
+                article["potential_publish_date"]
+                if "potential_publish_date" in article
+                else None
+            ),
+            top_image_url=(
+                article["top_image_url"] if "top_image_url" in article else None
+            ),
+            authors=article["authors"] if "authors" in article else None,
         )

     return results
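
For reviewers who want to exercise the `extract()` entry point this diff reformats, here is a minimal usage sketch. It is not part of the change, and it assumes the distribution installs from PyPI as `mediacloud-metadata` and imports as `mcmetadata`; the URL is a placeholder.

import mcmetadata

# Simplest call: the library fetches the page itself before extracting.
results = mcmetadata.extract(url="https://example.com/news/some-article")
print(results["article_title"], results["language"], results["publication_date"])

# Supply pre-fetched HTML to skip the network fetch, plus a per-call stats
# accumulator so timings are not added to the module-level `stats` counter.
my_stats = {name: 0 for name in mcmetadata.STAT_NAMES}
results = mcmetadata.extract(
    url="https://example.com/news/some-article",
    html_text="<html>...</html>",
    stats_accumulator=my_stats,
)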
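The `defaults`/`overrides` branches follow the same pattern for every field: an override skips extraction for that field entirely, while a default fills in only when extraction returns None. A hedged sketch of that calling convention (key names are taken from the code above; the values and `cached_html` are hypothetical):

# Assumes cached_html holds HTML the caller fetched earlier.
results = mcmetadata.extract(
    url="https://example.com/news/some-article",
    html_text=cached_html,
    defaults={"language": "en"},  # used only if language detection fails
    overrides={"article_title": "Known Headline"},  # bypasses title extraction
)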