Commit baa8d42

add black formatter to pre-commit hooks
1 parent b69a588

25 files changed: +1334 -958

.github/workflows/publish-to-pypi.yml (+2 -2)

@@ -1,8 +1,8 @@
 name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
 
-on:
+on:
   push:
-    tags:
+    tags:
       - "*"
 
 jobs:

(The removed and added lines here, and in the README.md diff below, render identically; the differences appear to be whitespace-only.)

.pre-commit-config.yaml (+5)

@@ -15,6 +15,11 @@ repos:
 # hooks:
 # - id: mypy
 #   entry: mypy
+  - repo: http://github.com/ambv/black
+    rev: 24.4.2
+    hooks:
+      - id: black
+        language_version: python3.10
   - repo: http://github.com/pycqa/flake8
     rev: 6.1.0
     hooks:
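
For context, the kind of rewrite black applies in the Python diffs below can be reproduced with black's string-formatting API. A minimal sketch, assuming black 24.4.2 is installed; this is illustration, not part of the commit:

```python
import black

# A single-quoted line from mcmetadata/__init__.py as it looked before this
# commit; black rewrites it with double quotes, as the diff below shows.
src = "STAT_NAMES = ['total', 'fetch', 'url', 'pub_date', 'content', 'title', 'language']\n"
print(black.format_str(src, mode=black.Mode()))
# STAT_NAMES = ["total", "fetch", "url", "pub_date", "content", "title", "language"]
```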

README.md (+1 -1)

@@ -15,7 +15,7 @@ but also build on numerous 3rd party libraries. The metadata extracted includes:
 * the text content of the news article
 * the name of the library used to extract the article content
 
-Other often-reused methods and configuration related to the mediacloud service also live in this package.
+Other often-reused methods and configuration related to the mediacloud service also live in this package.
 
 
 Installation

conftest.py (+6 -3)

@@ -5,7 +5,10 @@
 
 def pytest_addoption(parser):
     parser.addoption(
-        '--use-cache', default=True, nargs="?", const=True,
-        help='Use cached versions of content instead of fetching at every step',
-        type=lambda x: bool(strtobool(x))
+        "--use-cache",
+        default=True,
+        nargs="?",
+        const=True,
+        help="Use cached versions of content instead of fetching at every step",
+        type=lambda x: bool(strtobool(x)),
     )
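
A minimal sketch of what that `type=` callable does with command-line values, assuming `strtobool` is imported from `distutils.util` (the import sits outside this hunk):

```python
from distutils.util import strtobool

# strtobool maps "y"/"yes"/"true"/"1" to 1 and "n"/"no"/"false"/"0" to 0;
# wrapping it in bool() yields the True/False that pytest stores.
parse = lambda x: bool(strtobool(x))
print(parse("yes"), parse("1"))     # True True
print(parse("no"), parse("false"))  # False False
```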

mcmetadata/__init__.py (+66 -42)

@@ -7,20 +7,25 @@
 from . import content, dates, languages, titles, urls, webpages
 
 # work around to read the version from the pyproject.toml so it is maintained in one place
-__version__ = importlib.metadata.version('mediacloud-metadata')
+__version__ = importlib.metadata.version("mediacloud-metadata")
 
 logger = logging.getLogger(__name__)
 
 # Publication dates more than this many days in the future will be ignored (because they are probably bad guesses)
 MAX_FUTURE_PUB_DATE = 90
 
-STAT_NAMES = ['total', 'fetch', 'url', 'pub_date', 'content', 'title', 'language']
+STAT_NAMES = ["total", "fetch", "url", "pub_date", "content", "title", "language"]
 stats = {s: 0 for s in STAT_NAMES}
 
 
-def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
-            defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
-            stats_accumulator: Mapping[str, int] = None) -> Dict:
+def extract(
+    url: str,
+    html_text: Optional[str] = None,
+    include_other_metadata: Optional[bool] = False,
+    defaults: Mapping[str, Any] = {},
+    overrides: Mapping[str, Any] = {},
+    stats_accumulator: Mapping[str, int] = None,
+) -> Dict:
     """
     The core method of this library - returns all the useful information extracted from the HTML of the news
     article at the supplied URL.
@@ -43,28 +48,34 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
     for `STAT_NAMES` (see above).
     """
-    if stats_accumulator is None:  # can't default to global because of Python reference handling in defaults
+    if (
+        stats_accumulator is None
+    ):  # can't default to global because of Python reference handling in defaults
         stats_accumulator = stats
     t0 = time.monotonic()
     # first fetch the real content (if we need to)
     t1 = t0
     if html_text is None:
         raw_html, response = webpages.fetch(url)
         # check for archived URLs
-        if 'memento-datetime' in response.headers:
+        if "memento-datetime" in response.headers:
             try:
-                final_url = response.links['original']['url']  # the original url archived
+                final_url = response.links["original"][
+                    "url"
+                ]  # the original url archived
             except KeyError:
                 # maybe the responder doesn't provide the desired headers, so just fall back on the full URL because
                 # there's nothing else we can really do
                 final_url = response.url  # followed all the redirects
         else:
             final_url = response.url  # followed all the redirects
     else:
-        final_url = url  # trust that the user knows which URL the content actually came from
+        final_url = (
+            url  # trust that the user knows which URL the content actually came from
+        )
         raw_html = html_text
     fetch_duration = time.monotonic() - t1
-    stats_accumulator['fetch'] += fetch_duration
+    stats_accumulator["fetch"] += fetch_duration
 
     # url
     t1 = time.monotonic()
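
A hedged sketch of the per-call timing hook shown in this hunk (the URL is hypothetical):

```python
import mcmetadata

# Passing your own accumulator keeps this call's timings out of the
# module-level `stats` counter; keys must match STAT_NAMES.
my_stats = {name: 0 for name in mcmetadata.STAT_NAMES}
mcmetadata.extract("https://example.com/a-news-story", stats_accumulator=my_stats)
print(my_stats["fetch"], my_stats["total"])  # seconds spent fetching / in total
```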
@@ -73,60 +84,65 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     is_homepage_url = urls.is_homepage_url(url)
     is_shortened_url = urls.is_shortened_url(url)
     url_duration = time.monotonic() - t1
-    stats_accumulator['url'] += url_duration
+    stats_accumulator["url"] += url_duration
 
     # pub date stuff
     t1 = time.monotonic()
     max_pub_date = dt.datetime.now() + dt.timedelta(days=+MAX_FUTURE_PUB_DATE)
-    if 'publication_date' in overrides:
-        pub_date = overrides['publication_date']
+    if "publication_date" in overrides:
+        pub_date = overrides["publication_date"]
     else:
-        default_date = defaults.get('publication_date') if defaults else None
-        pub_date = dates.guess_publication_date(raw_html, final_url, max_date=max_pub_date, default_date=default_date)
+        default_date = defaults.get("publication_date") if defaults else None
+        pub_date = dates.guess_publication_date(
+            raw_html, final_url, max_date=max_pub_date, default_date=default_date
+        )
     pub_date_duration = time.monotonic() - t1
-    stats_accumulator['pub_date'] += pub_date_duration
+    stats_accumulator["pub_date"] += pub_date_duration
 
     # content
     t1 = time.monotonic()
-    if 'text_content' in overrides:
-        article = dict(extraction_method=content.METHOD_OVERRIDEN,
-                       text=overrides['text_content'])
+    if "text_content" in overrides:
+        article = dict(
+            extraction_method=content.METHOD_OVERRIDEN, text=overrides["text_content"]
+        )
     else:
         article = content.from_html(final_url, raw_html, include_other_metadata)
     content_duration = time.monotonic() - t1
-    stats_accumulator['content'] += content_duration
+    stats_accumulator["content"] += content_duration
 
     # title
     t1 = time.monotonic()
-    if 'article_title' in overrides:
-        article_title = overrides['article_title']
+    if "article_title" in overrides:
+        article_title = overrides["article_title"]
     else:
-        article_title = titles.from_html(raw_html, article['title'])
+        article_title = titles.from_html(raw_html, article["title"])
         if article_title is None:
-            article_title = defaults.get('article_title') if defaults else None
+            article_title = defaults.get("article_title") if defaults else None
     normalized_title = titles.normalize_title(article_title)
     title_duration = time.monotonic() - t1
-    stats_accumulator['title'] += title_duration
+    stats_accumulator["title"] += title_duration
 
     # language
     t1 = time.monotonic()
-    if 'language' in overrides:
-        full_language = overrides['language']
+    if "language" in overrides:
+        full_language = overrides["language"]
     else:
-        full_language = languages.from_html(raw_html, article['text'])  # could be something like "pt-br"
+        full_language = languages.from_html(
+            raw_html, article["text"]
+        )  # could be something like "pt-br"
         if full_language is None:
-            full_language = defaults.get('language') if defaults else None
+            full_language = defaults.get("language") if defaults else None
     language_duration = time.monotonic() - t1
-    stats_accumulator['language'] += language_duration
+    stats_accumulator["language"] += language_duration
 
     # canonical url
-    if 'canonical_url' in overrides:
-        canonical_url = overrides['canonical_url']
+    if "canonical_url" in overrides:
+        canonical_url = overrides["canonical_url"]
     else:
-        canonical_url = article.get('canonical_url')
+        canonical_url = article.get("canonical_url")
 
     total_duration = time.monotonic() - t0
-    stats_accumulator['total'] += total_duration
+    stats_accumulator["total"] += total_duration
 
     results = dict(
         original_url=url,
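
A hedged sketch of the overrides/defaults cascade in this hunk: an override replaces an extraction step outright, while a default only fills in when extraction finds nothing (`cached_html` and the URL are hypothetical):

```python
results = mcmetadata.extract(
    url="https://example.com/a-news-story",  # hypothetical
    html_text=cached_html,  # hypothetical; supplying HTML skips the fetch step
    overrides={"language": "en"},  # skips language detection entirely
    defaults={"article_title": "Untitled"},  # used only if no title is found
)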
@@ -136,23 +152,31 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
         canonical_domain=canonical_domain,
         canonical_url=canonical_url,
         publication_date=pub_date,
-        language=full_language[:2] if full_language else full_language,  # keep this as a two-letter code, like "en"
+        language=(
+            full_language[:2] if full_language else full_language
+        ),  # keep this as a two-letter code, like "en"
         full_language=full_language,  # could be a full region language code, like "en-AU"
-        text_extraction_method=article['extraction_method'],
+        text_extraction_method=article["extraction_method"],
         article_title=article_title,
         normalized_article_title=normalized_title,
-        text_content=article['text'],
+        text_content=article["text"],
         is_homepage=is_homepage_url,
         is_shortened=is_shortened_url,
         version=__version__,
     )
     if include_other_metadata:
         # other metadata we've done less robust validation on, but might be useful
-        results['other'] = dict(
-            raw_title=article['title'] if 'title' in article else None,
-            raw_publish_date=article['potential_publish_date'] if 'potential_publish_date' in article else None,
-            top_image_url=article['top_image_url'] if 'top_image_url' in article else None,
-            authors=article['authors'] if 'authors' in article else None,
+        results["other"] = dict(
+            raw_title=article["title"] if "title" in article else None,
+            raw_publish_date=(
+                article["potential_publish_date"]
+                if "potential_publish_date" in article
+                else None
+            ),
+            top_image_url=(
+                article["top_image_url"] if "top_image_url" in article else None
+            ),
+            authors=article["authors"] if "authors" in article else None,
         )
 
     return results
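
Putting it together, a hedged usage sketch of the result dict assembled above (the URL is hypothetical):

```python
import mcmetadata

results = mcmetadata.extract(
    "https://example.com/a-news-story", include_other_metadata=True
)
print(results["canonical_domain"], results["publication_date"])
print(results["language"], results["full_language"])  # e.g. "en" vs "en-AU"
print(results["text_extraction_method"], results["version"])
print(results["other"]["authors"])  # less-validated extras
```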
