@@ -7,20 +7,25 @@
 from . import content, dates, languages, titles, urls, webpages

 # work around to read the version from the pyproject.toml so it is maintained in one place
-__version__ = importlib.metadata.version('mediacloud-metadata')
+__version__ = importlib.metadata.version("mediacloud-metadata")

 logger = logging.getLogger(__name__)

 # Publication dates more than this many days in the future will be ignored (because they are probably bad guesses)
 MAX_FUTURE_PUB_DATE = 90

-STAT_NAMES = ['total', 'fetch', 'url', 'pub_date', 'content', 'title', 'language']
+STAT_NAMES = ["total", "fetch", "url", "pub_date", "content", "title", "language"]

 stats = {s: 0 for s in STAT_NAMES}


-def extract(url: str, html_text: Optional[str] = None, include_other_metadata: Optional[bool] = False,
-            defaults: Mapping[str, Any] = {}, overrides: Mapping[str, Any] = {},
-            stats_accumulator: Mapping[str, int] = None) -> Dict:
+def extract(
+    url: str,
+    html_text: Optional[str] = None,
+    include_other_metadata: Optional[bool] = False,
+    defaults: Mapping[str, Any] = {},
+    overrides: Mapping[str, Any] = {},
+    stats_accumulator: Mapping[str, int] = None,
+) -> Dict:
     """
     The core method of this library - returns all the useful information extracted from the HTML of the news
     article at the supplied URL.
@@ -43,28 +48,34 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     timings for the call will _not_ be added to the module-level `stats` counter. Should contain keys
     for `STAT_NAMES` (see above).
     """
-    if stats_accumulator is None:  # can't default to global because of Python reference handling in defaults
+    if (
+        stats_accumulator is None
+    ):  # can't default to global because of Python reference handling in defaults
         stats_accumulator = stats
     t0 = time.monotonic()
     # first fetch the real content (if we need to)
     t1 = t0
     if html_text is None:
         raw_html, response = webpages.fetch(url)
         # check for archived URLs
-        if 'memento-datetime' in response.headers:
+        if "memento-datetime" in response.headers:
             try:
-                final_url = response.links['original']['url']  # the original url archived
+                final_url = response.links["original"][
+                    "url"
+                ]  # the original url archived
             except KeyError:
                 # maybe the responder doesn't provide the desired headers, so just fall back on the full URL because
                 # there's nothing else we can really do
                 final_url = response.url  # followed all the redirects
         else:
             final_url = response.url  # followed all the redirects
     else:
-        final_url = url  # trust that the user knows which URL the content actually came from
+        final_url = (
+            url  # trust that the user knows which URL the content actually came from
+        )
         raw_html = html_text
     fetch_duration = time.monotonic() - t1
-    stats_accumulator['fetch'] += fetch_duration
+    stats_accumulator["fetch"] += fetch_duration

     # url
     t1 = time.monotonic()
@@ -73,60 +84,65 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     is_homepage_url = urls.is_homepage_url(url)
     is_shortened_url = urls.is_shortened_url(url)
     url_duration = time.monotonic() - t1
-    stats_accumulator['url'] += url_duration
+    stats_accumulator["url"] += url_duration

     # pub date stuff
     t1 = time.monotonic()
     max_pub_date = dt.datetime.now() + dt.timedelta(days=+MAX_FUTURE_PUB_DATE)
-    if 'publication_date' in overrides:
-        pub_date = overrides['publication_date']
+    if "publication_date" in overrides:
+        pub_date = overrides["publication_date"]
     else:
-        default_date = defaults.get('publication_date') if defaults else None
-        pub_date = dates.guess_publication_date(raw_html, final_url, max_date=max_pub_date, default_date=default_date)
+        default_date = defaults.get("publication_date") if defaults else None
+        pub_date = dates.guess_publication_date(
+            raw_html, final_url, max_date=max_pub_date, default_date=default_date
+        )
     pub_date_duration = time.monotonic() - t1
-    stats_accumulator['pub_date'] += pub_date_duration
+    stats_accumulator["pub_date"] += pub_date_duration

     # content
     t1 = time.monotonic()
-    if 'text_content' in overrides:
-        article = dict(extraction_method=content.METHOD_OVERRIDEN,
-                       text=overrides['text_content'])
+    if "text_content" in overrides:
+        article = dict(
+            extraction_method=content.METHOD_OVERRIDEN, text=overrides["text_content"]
+        )
     else:
         article = content.from_html(final_url, raw_html, include_other_metadata)
     content_duration = time.monotonic() - t1
-    stats_accumulator['content'] += content_duration
+    stats_accumulator["content"] += content_duration

     # title
     t1 = time.monotonic()
-    if 'article_title' in overrides:
-        article_title = overrides['article_title']
+    if "article_title" in overrides:
+        article_title = overrides["article_title"]
     else:
-        article_title = titles.from_html(raw_html, article['title'])
+        article_title = titles.from_html(raw_html, article["title"])
         if article_title is None:
-            article_title = defaults.get('article_title') if defaults else None
+            article_title = defaults.get("article_title") if defaults else None
     normalized_title = titles.normalize_title(article_title)
     title_duration = time.monotonic() - t1
-    stats_accumulator['title'] += title_duration
+    stats_accumulator["title"] += title_duration

     # language
     t1 = time.monotonic()
-    if 'language' in overrides:
-        full_language = overrides['language']
+    if "language" in overrides:
+        full_language = overrides["language"]
     else:
-        full_language = languages.from_html(raw_html, article['text'])  # could be something like "pt-br"
+        full_language = languages.from_html(
+            raw_html, article["text"]
+        )  # could be something like "pt-br"
         if full_language is None:
-            full_language = defaults.get('language') if defaults else None
+            full_language = defaults.get("language") if defaults else None
     language_duration = time.monotonic() - t1
-    stats_accumulator['language'] += language_duration
+    stats_accumulator["language"] += language_duration

     # canonical url
-    if 'canonical_url' in overrides:
-        canonical_url = overrides['canonical_url']
+    if "canonical_url" in overrides:
+        canonical_url = overrides["canonical_url"]
     else:
-        canonical_url = article.get('canonical_url')
+        canonical_url = article.get("canonical_url")

     total_duration = time.monotonic() - t0
-    stats_accumulator['total'] += total_duration
+    stats_accumulator["total"] += total_duration

     results = dict(
         original_url=url,
@@ -136,23 +152,31 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
         canonical_domain=canonical_domain,
         canonical_url=canonical_url,
         publication_date=pub_date,
-        language=full_language[:2] if full_language else full_language,  # keep this as a two-letter code, like "en"
+        language=(
+            full_language[:2] if full_language else full_language
+        ),  # keep this as a two-letter code, like "en"
         full_language=full_language,  # could be a full region language code, like "en-AU"
-        text_extraction_method=article['extraction_method'],
+        text_extraction_method=article["extraction_method"],
         article_title=article_title,
         normalized_article_title=normalized_title,
-        text_content=article['text'],
+        text_content=article["text"],
         is_homepage=is_homepage_url,
         is_shortened=is_shortened_url,
         version=__version__,
     )
     if include_other_metadata:
         # other metadata we've done less robust validation on, but might be useful
-        results['other'] = dict(
-            raw_title=article['title'] if 'title' in article else None,
-            raw_publish_date=article['potential_publish_date'] if 'potential_publish_date' in article else None,
-            top_image_url=article['top_image_url'] if 'top_image_url' in article else None,
-            authors=article['authors'] if 'authors' in article else None,
+        results["other"] = dict(
+            raw_title=article["title"] if "title" in article else None,
+            raw_publish_date=(
+                article["potential_publish_date"]
+                if "potential_publish_date" in article
+                else None
+            ),
+            top_image_url=(
+                article["top_image_url"] if "top_image_url" in article else None
+            ),
+            authors=article["authors"] if "authors" in article else None,
         )

     return results
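
For reviewers who want to exercise the `extract()` entry point this diff reformats, here is a minimal usage sketch. It is not part of the change, and it assumes the distribution installs from PyPI as `mediacloud-metadata` and imports as `mcmetadata`; the URL is a placeholder.

import mcmetadata

# Simplest call: the library fetches the page itself before extracting.
results = mcmetadata.extract(url="https://example.com/news/some-article")
print(results["article_title"], results["language"], results["publication_date"])

# Supply pre-fetched HTML to skip the network fetch, plus a per-call stats
# accumulator so timings are not added to the module-level `stats` counter.
my_stats = {name: 0 for name in mcmetadata.STAT_NAMES}
results = mcmetadata.extract(
    url="https://example.com/news/some-article",
    html_text="<html>...</html>",
    stats_accumulator=my_stats,
)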
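The `defaults`/`overrides` branches follow the same pattern for every field: an override skips extraction for that field entirely, while a default fills in only when extraction returns None. A hedged sketch of that calling convention (key names are taken from the code above; the values and `cached_html` are hypothetical):

# Assumes cached_html holds HTML the caller fetched earlier.
results = mcmetadata.extract(
    url="https://example.com/news/some-article",
    html_text=cached_html,
    defaults={"language": "en"},  # used only if language detection fails
    overrides={"article_title": "Known Headline"},  # bypasses title extraction
)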