
Working of scrape cmd -m
Related to #763
Yomguithereal committed Dec 11, 2023
1 parent 56f0146 commit 949832e
Showing 14 changed files with 241 additions and 244 deletions.
24 changes: 18 additions & 6 deletions docs/cli.md
@@ -1121,7 +1121,7 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/
```
Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
[--simple-progress] [-g] [-I INPUT_DIR] [-p PROCESSES]
[--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES]
[--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN]
[--url-column URL_COLUMN] [--error-column ERROR_COLUMN]
[--status-column STATUS_COLUMN]
@@ -1138,6 +1138,8 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
Use multiple processes to scrape data from a batch of HTML files using
minet scraping DSL documented here:
https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
or a python function passed via the -m/--module flag, or an already
implemented typical scraping routine (listed below).

It will output the scraped items as a CSV or NDJSON file.

@@ -1164,8 +1166,10 @@ an error occurred.

Positional Arguments:
scraper Path to a scraper definition file, or name of a
builtin scraper, e.g. "title". See the complete
list below.
builtin scraper, e.g. "title" (see the complete
list below), or a path to a python module and
function (e.g. scraper.py,
scraper.py:scrape_title).
path_or_path_column Single path to process or name of the CSV column
containing paths when using -i/--input. Defaults
to "path".
@@ -1190,6 +1194,8 @@ Optional Arguments:
--mimetype-column MIMETYPE_COLUMN
Name of the CSV column containing file mimetype.
Defaults to `mimetype`.
-m, --module Whether the given scraper is a python target to
import.
--plural-separator PLURAL_SEPARATOR
Separator used to join lists of values when
serializing to CSV. Defaults to `|`.
@@ -1263,6 +1269,15 @@ Examples:
. Scraping a single url:
$ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i -

. Using a builtin scraper:
$ minet scrape title -i report.csv > titles.csv

. Using the `scrape` (default) function of a target python module:
$ minet scrape scraper.py -i report.csv > titles.csv

. Using the `scrape_title` function of a target python module:
$ minet scrape scraper.py:scrape_title -i report.csv > titles.csv

. Indicating a custom path column (e.g. "file"):
$ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv

@@ -1286,9 +1301,6 @@ Examples:

. Keeping only some columns from input CSV file:
$ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv

. Using a builtin scraper:
$ minet scrape title -i report.csv > titles.csv
```
## screenshot
2 changes: 2 additions & 0 deletions ftest/ftest-array.sh
@@ -15,6 +15,8 @@ echo
echo "Scrape"
echo " - Single HTML file"
$MINET scrape -p 1 $EXTRACT_DIR/scraper.yml $EXTRACT_DIR/article.html | wc -l
echo " - Single HTML file, typical scraper"
$MINET scrape -p 1 title $EXTRACT_DIR/article.html | wc -l
echo " - Single glob pattern"
$MINET scrape -p 1 $EXTRACT_DIR/scraper.yml "$EXTRACT_DIR/*.html" -g | wc -l
echo " - CSV input"
10 changes: 10 additions & 0 deletions ftest/scrapers/title.py
@@ -0,0 +1,10 @@
from minet.scrape import WonderfulSoup
from casanova import RowWrapper


def scrape(row: RowWrapper, soup: WonderfulSoup):
return {"url": row.url, "title": soup.scrape_one("title")}


def titles(row: RowWrapper, soup: WonderfulSoup):
yield soup.scrape_one("title")
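A function-based scraper such as this one receives the current input row and the parsed document, and may either return a single item (here a dict, which maps cleanly onto CSV columns) or yield values one by one. A minimal sketch of exercising it outside the CLI, assuming `WonderfulSoup` can be built from raw markup like the BeautifulSoup it wraps, and using a plain namespace as a stand-in for casanova's `RowWrapper` (both are assumptions):

```python
from types import SimpleNamespace

from minet.scrape import WonderfulSoup

# The functions above only touch `row.url`, so a simple namespace is
# enough to stand in for casanova's RowWrapper in a quick manual test.
row = SimpleNamespace(url="https://example.com")
soup = WonderfulSoup("<html><head><title>Hello</title></head></html>")

print(scrape(row, soup))        # expected: {'url': 'https://example.com', 'title': 'Hello'}
print(list(titles(row, soup)))  # expected: ['Hello']
```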
21 changes: 17 additions & 4 deletions minet/cli/scrape/__init__.py
@@ -14,6 +14,8 @@ def resolve_arguments(cli_args):
Use multiple processes to scrape data from a batch of HTML files using
minet scraping DSL documented here:
https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
or a python function passed via the -m/--module flag, or an already
implemented typical scraping routine (listed below).
It will output the scraped items as a CSV or NDJSON file.
@@ -61,6 +63,15 @@ def resolve_arguments(cli_args):
. Scraping a single url:
$ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i -
. Using a builtin scraper:
$ minet scrape title -i report.csv > titles.csv
. Using the `scrape` (default) function of a target python module:
$ minet scrape scraper.py -i report.csv > titles.csv
. Using the `scrape_title` function of a target python module:
$ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
. Indicating a custom path column (e.g. "file"):
$ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
@@ -84,16 +95,18 @@ def resolve_arguments(cli_args):
. Keeping only some columns from input CSV file:
$ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv
. Using a builtin scraper:
$ minet scrape title -i report.csv > titles.csv
""",
resolve=resolve_arguments,
variadic_input={"dummy_column": "path", "optional": True, "no_help": True},
arguments=[
{
"name": "scraper",
"help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title". See the complete list below.',
"help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title" (see the complete list below), or a path to a python module and function (e.g. scraper.py, scraper.py:scrape_title).',
},
{
"flags": ["-m", "--module"],
"help": "Whether given scraper is a python target to import.",
"action": "store_true",
},
{
"flags": ["-g", "--glob"],
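The `-m/--module` flag and the `scraper.py:scrape_title` target syntax above are backed by `minet.utils.import_target`, called in `scrape.py` below with `default="scrape"`. As a rough sketch of what this kind of target resolution can look like (assumed behavior and a made-up function name, not minet's actual implementation):

```python
# Hypothetical sketch of resolving a "path[:name]" python target, in the
# spirit of minet.utils.import_target. Assumed behavior, not minet's code.
import importlib.util


def import_python_target(target: str, default: str = "scrape"):
    # "scraper.py:scrape_title" -> ("scraper.py", "scrape_title");
    # a bare "scraper.py" falls back to the default function name.
    path, _, name = target.partition(":")
    name = name or default

    spec = importlib.util.spec_from_file_location("user_scraper", path)
    assert spec is not None and spec.loader is not None

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    return getattr(module, name)
```

With `default="scrape"`, this matches the documented examples where a bare `minet scrape scraper.py` picks up the module's `scrape` function.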
143 changes: 86 additions & 57 deletions minet/cli/scrape/scrape.py
@@ -13,12 +13,18 @@
from threading import Lock
from os.path import basename, isdir

from minet.scrape import Scraper
from minet.scrape.typical import TYPICAL_SCRAPERS
from minet.scrape.types import ScraperBase
from minet.utils import import_target
from minet.scrape.classes import (
NAMED_SCRAPERS,
ScraperBase,
DefinitionScraper,
FunctionScraper,
)
from minet.multiprocessing import LazyPool
from minet.exceptions import (
DefinitionInvalidFormatError,
GenericModuleNotFoundError,
TargetInGenericModuleNotFoundError,
)
from minet.scrape.exceptions import (
InvalidScraperError,
@@ -57,30 +63,19 @@ class ScrapeResult:


SCRAPER: Optional[ScraperBase] = None
FORMAT: Optional[str] = None
PLURAL_SEPARATOR: Optional[str] = None
HEADERS: Optional[casanova.headers] = None


def init_process(options):
def init_process(scraper: ScraperBase, fieldnames: List[str]):
global SCRAPER
global FORMAT
global PLURAL_SEPARATOR
global HEADERS

if options["name"] is not None:
SCRAPER = TYPICAL_SCRAPERS[options["name"]]()
else:
SCRAPER = Scraper(options["definition"], strain=options["strain"])

FORMAT = options["format"]
PLURAL_SEPARATOR = options["plural_separator"]
HEADERS = casanova.headers(options["fieldnames"])
SCRAPER = scraper
HEADERS = casanova.headers(fieldnames)


def worker(payload: ScrapeWorkerPayload) -> ScrapeResult:
assert SCRAPER is not None
assert PLURAL_SEPARATOR is not None
assert HEADERS is not None

text = payload.text
@@ -109,12 +104,7 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult:
context["basename"] = basename(payload.path)

# Attempting to scrape
if FORMAT == "csv":
items = SCRAPER.as_csv_rows(
text, context=context, plural_separator=PLURAL_SEPARATOR
)
else:
items = SCRAPER.as_records(text, context=context)
items = SCRAPER.items(text, context=context)

# NOTE: errors might be raised when we consume the generators created above
try:
@@ -129,19 +119,39 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult:


def action(cli_args):
using_typical_scraper = False

# Parsing scraper definition
try:
if cli_args.scraper in TYPICAL_SCRAPERS:
using_typical_scraper = True
scraper = TYPICAL_SCRAPERS[cli_args.scraper]()
if cli_args.module:
fn = import_target(cli_args.scraper, default="scrape")
scraper = FunctionScraper(fn, strain=cli_args.strain)
elif cli_args.scraper in NAMED_SCRAPERS:
scraper = NAMED_SCRAPERS[cli_args.scraper]()
else:
scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
scraper = DefinitionScraper(cli_args.scraper, strain=cli_args.strain)

except GenericModuleNotFoundError:
raise FatalError(
[
"Could not import %s!" % cli_args.scraper,
"Are you sure the module exists?",
]
)

except TargetInGenericModuleNotFoundError as e:
raise FatalError(
[
"Could not find the %s target in the %s module!" % (e.name, e.path),
"Are you sure this class/function/variable exists in the module?",
]
)

except DefinitionInvalidFormatError:
raise FatalError(
["Unknown scraper format!", "It should be a JSON or YAML file."]
[
"Unknown scraper format!",
"It should be a JSON or YAML file.",
"Or did you forget the -m/--module flag?",
]
)

except FileNotFoundError:
@@ -165,7 +175,7 @@ def action(cli_args):
]
)

if scraper.fieldnames is None and cli_args.format == "csv":
if not scraper.tabular and cli_args.format == "csv":
raise FatalError(
[
"Your scraper does not yield tabular data.",
@@ -183,26 +193,54 @@ def action(cli_args):
writer_lock = Lock()

if cli_args.format == "csv":
assert scraper.fieldnames is not None
if isinstance(scraper, FunctionScraper):
reader = casanova.reader(cli_args.input, total=cli_args.total)

output_fieldnames = scraper.fieldnames
# TODO: support for inferring_enricher
# TODO: support forwarding cases that will yield None
writer = casanova.inferring_writer(
cli_args.output, plural_separator=cli_args.plural_separator
)

if cli_args.scraped_column_prefix is not None:
output_fieldnames = [
cli_args.scraped_column_prefix + h for h in output_fieldnames
]
def writerow(row, item):
writer.writerow(item)

enricher = casanova.enricher(
cli_args.input,
cli_args.output,
total=cli_args.total,
select=cli_args.select,
add=output_fieldnames,
)
reader = enricher
else:
assert scraper.fieldnames is not None

def writerow(row, item):
enricher.writerow(row, item)
serializer = casanova.CSVSerializer(
plural_separator=cli_args.plural_separator
)

output_fieldnames = scraper.fieldnames

if cli_args.scraped_column_prefix is not None:
output_fieldnames = [
cli_args.scraped_column_prefix + h for h in output_fieldnames
]

enricher = casanova.enricher(
cli_args.input,
cli_args.output,
total=cli_args.total,
select=cli_args.select,
add=output_fieldnames,
)
reader = enricher

def writerow(row, item):
assert scraper.fieldnames is not None

if item is None:
enricher.writerow(row)
return

if isinstance(item, dict):
item = [item.get(f) for f in scraper.fieldnames]
else:
item = [item]

enricher.writerow(row, (serializer(v) for v in item)) # type: ignore

else:
# TODO: casanova should probably expose some ndjson enricher
@@ -254,16 +292,7 @@ def payloads() -> Iterator[ScrapeWorkerPayload]:
pool = LazyPool(
cli_args.processes,
initializer=init_process,
initargs=(
{
"name": cli_args.scraper if using_typical_scraper else None,
"definition": getattr(scraper, "definition", None),
"strain": cli_args.strain if not using_typical_scraper else None,
"format": cli_args.format,
"plural_separator": cli_args.plural_separator,
"fieldnames": reader.fieldnames,
},
),
initargs=(scraper, reader.fieldnames),
)

loading_bar.append_to_title(" (p=%i)" % pool.processes)
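The `init_process` rewrite above is the textbook multiprocessing initializer pattern: read-only shared state is shipped once per worker process through `initargs` and parked in module globals, rather than being rebuilt from an options dict on every call. A self-contained, generic sketch of the pattern (standard library `Pool` here, not minet's `LazyPool`):

```python
# Generic sketch of the pool-initializer pattern used by init_process/worker
# above: each worker process receives shared state once, via initargs, and
# stashes it in a module-level global for subsequent calls.
from multiprocessing import Pool

STATE = None


def init_process(state):
    global STATE
    STATE = state


def worker(item):
    assert STATE is not None  # set by init_process in this very process
    return STATE["prefix"] + item


if __name__ == "__main__":
    with Pool(2, initializer=init_process, initargs=({"prefix": "scraped:"},)) as pool:
        print(pool.map(worker, ["a", "b"]))  # ['scraped:a', 'scraped:b']
```

One practical consequence, and presumably part of why the scraper object can now be handed to `initargs` directly, is that everything passed this way must be picklable.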
4 changes: 2 additions & 2 deletions minet/scrape/__init__.py
@@ -4,7 +4,7 @@
#
# Module exposing utilities related to minet's scraping DSL.
#
from minet.scrape.scraper import scrape, Scraper, validate
from minet.scrape.classes.definition import scrape, DefinitionScraper, validate
from minet.scrape.soup import WonderfulSoup
from minet.scrape.regex import (
extract_encodings_from_xml,
@@ -15,7 +15,7 @@

__all__ = [
"scrape",
"Scraper",
"DefinitionScraper",
"validate",
"WonderfulSoup",
"extract_encodings_from_xml",
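For downstream code, the visible effect of this rename is at import time:

```python
# Before this commit:
from minet.scrape import Scraper

# After this commit, the same class lives under a more explicit name:
from minet.scrape import DefinitionScraper
```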
4 changes: 4 additions & 0 deletions minet/scrape/classes/__init__.py
@@ -0,0 +1,4 @@
from minet.scrape.classes.base import ScraperBase
from minet.scrape.classes.definition import DefinitionScraper, validate, scrape
from minet.scrape.classes.function import FunctionScraper
from minet.scrape.classes.named import NamedScraper, NAMED_SCRAPERS
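The new `classes` package splits the former monolithic scraper into three flavors behind a common `ScraperBase`: `DefinitionScraper` for DSL files, `NamedScraper`/`NAMED_SCRAPERS` for the builtin routines such as `title`, and `FunctionScraper` for imported python targets. Judging only from how `minet/cli/scrape/scrape.py` uses it (`items()`, `.tabular`, `.fieldnames`, `strain`), a `FunctionScraper` is roughly a thin adapter around the user function; a speculative sketch, not the actual implementation:

```python
# Speculative sketch of the FunctionScraper idea, inferred from its usage in
# minet/cli/scrape/scrape.py. Attribute semantics here are assumptions.
import inspect
from typing import Any, Callable, Iterator, Optional

from minet.scrape import WonderfulSoup


class FunctionScraperSketch:
    def __init__(self, fn: Callable[..., Any], strain: Optional[str] = None):
        self.fn = fn
        self.strain = strain  # CSS selector used to restrict parsing, per the CLI docs
        self.fieldnames = None  # unknown upfront, hence casanova.inferring_writer in the CLI
        self.tabular = True  # assumed, since the CLI's csv path accepts function scrapers

    def items(self, text: str, context: Optional[dict] = None) -> Iterator[Any]:
        soup = WonderfulSoup(text)
        result = self.fn((context or {}).get("row"), soup)

        # Generator functions yield several items; plain functions return one.
        if inspect.isgenerator(result):
            yield from result
        else:
            yield result
```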
