Skip to content

Commit

Permalink
Friendlier error when lzma is not there and importing trafilatura fails
Browse files Browse the repository at this point in the history
Fix #937
  • Loading branch information
Yomguithereal committed Feb 15, 2024
1 parent c3bb7e0 commit e8dc175
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 5 deletions.
14 changes: 12 additions & 2 deletions minet/cli/argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,14 @@ def build_parser(name, version, commands):

return parser, subparser_index

FOLDER_STRATEGY_CHOICES = ["flat", "fullpath", "hostname", "normalize-hostname", "prefix-x"]

# Folder strategy names, listed one per line so additions produce
# single-line diffs.
FOLDER_STRATEGY_CHOICES = [
    "flat",
    "fullpath",
    "hostname",
    "normalize-hostname",
    "prefix-x",
]

# NOTE: indentation IS important
FOLDER_STRATEGY_DOCUMENTATION = """
Expand Down Expand Up @@ -321,6 +328,7 @@ def build_parser(name, version, commands):
"fr.", for instance) and their public suffix will be dropped.
"""


class FolderStrategyType:
def __call__(self, name):
from minet.fs import FolderStrategy
Expand Down Expand Up @@ -601,7 +609,9 @@ def __init__(
**kwargs,
):
fields_help = (
"Available flags are: " + and_join([f"`{f}`" for f in TRAFILATURA_FIELDNAMES]) + "."
"Available flags are: "
+ and_join([f"`{f}`" for f in TRAFILATURA_FIELDNAMES])
+ "."
)
help = fields_help if not help else help + " " + fields_help
super().__init__(option_strings, dest, help=help, default=default, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion minet/cli/crawl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
command,
FolderStrategyType,
BooleanAction,
ExtractionSelectionAction,FOLDER_STRATEGY_DOCUMENTATION
ExtractionSelectionAction,
FOLDER_STRATEGY_DOCUMENTATION,
)
from minet.cli.exceptions import InvalidArgumentsError

Expand Down
6 changes: 5 additions & 1 deletion minet/cli/fetch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from casanova import IndexedResumer

from minet.cli.argparse import command, FolderStrategyType, FOLDER_STRATEGY_DOCUMENTATION
from minet.cli.argparse import (
command,
FolderStrategyType,
FOLDER_STRATEGY_DOCUMENTATION,
)
from minet.cli.constants import DEFAULT_CONTENT_FOLDER, DEFAULT_SCREENSHOT_FOLDER
from minet.cli.exceptions import InvalidArgumentsError

Expand Down
11 changes: 10 additions & 1 deletion minet/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,16 @@

from dataclasses import dataclass, field
from casanova import TabularRecord
from trafilatura.core import bare_extraction

# Importing trafilatura can fail on Python builds compiled without lzma.
# Surface an actionable message in that case instead of the raw
# ModuleNotFoundError.
try:
    from trafilatura.core import bare_extraction
except ModuleNotFoundError as e:
    if "lzma" in str(e):
        # Chain explicitly so the traceback reports the original failure as
        # the direct cause of this friendlier error.
        raise ImportError(
            "cannot import trafilatura because your version of python was not compiled with lzma.\nSee https://stackoverflow.com/questions/57743230/userwarning-could-not-import-the-lzma-module-your-installed-python-is-incomple for potential solutions."
        ) from e

    # Any other missing module is unrelated to lzma: propagate it untouched.
    raise

from minet.exceptions import TrafilaturaError
from minet.encodings import fix_surrogates
Expand Down

0 comments on commit e8dc175

Please sign in to comment.