Adding -e/--eval flag to scrape command
Related to #922
Yomguithereal committed Dec 20, 2023
1 parent 0d7f03c commit be4554f
Showing 4 changed files with 35 additions and 13 deletions.
21 changes: 14 additions & 7 deletions docs/cli.md
@@ -1121,9 +1121,10 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/
 
 ```
 Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
-                    [--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES]
-                    [--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN]
-                    [--url-column URL_COLUMN] [--error-column ERROR_COLUMN]
+                    [--simple-progress] [-m] [-e] [-g] [-I INPUT_DIR]
+                    [-p PROCESSES] [--chunk-size CHUNK_SIZE]
+                    [--body-column BODY_COLUMN] [--url-column URL_COLUMN]
+                    [--error-column ERROR_COLUMN]
                     [--status-column STATUS_COLUMN]
                     [--encoding-column ENCODING_COLUMN]
                     [--mimetype-column MIMETYPE_COLUMN] [--encoding ENCODING]
@@ -1138,8 +1139,9 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
 Use multiple processes to scrape data from a batch of HTML files using
 minet scraping DSL documented here:
 https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-or a python function given using the -m/--module flag, or an already
-implemented typical scraping routine (listed below).
+or a python function given using the -m/--module flag, or a simple inline
+python expression given using the -e/--eval flag, or an already implemented
+typical scraping routine (listed below).
 
 It will output the scraped items as a CSV or NDJSON file.
 
@@ -1186,6 +1188,8 @@ Optional Arguments:
                                 Defaults to `encoding`.
   --error-column ERROR_COLUMN   Name of the CSV column containing a fetch error.
                                 Defaults to `fetch_error`.
+  -e, --eval                    Whether given scraper should be a simple
+                                expression to evaluate.
   -f, --format {csv,jsonl,ndjson}
                                 Output format. Defaults to `csv`.
   -g, --glob                    Will interpret given paths as glob patterns to
@@ -1273,10 +1277,13 @@ Examples:
     $ minet scrape title -i report.csv > titles.csv
 
 . Using the `scrape` (default) function of target python module:
-    $ minet scrape scraper.py -i report.csv > titles.csv
+    $ minet scrape -m scraper.py -i report.csv > titles.csv
 
 . Using the `scrape_title` function of target python module:
-    $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+    $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
 
+. Using an inline python expression to evaluate:
+    $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
+
 . Indicating a custom path column (e.g. "file"):
     $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
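A quick aside on composition: `-e/--eval` slots in like any other scraper source, so it combines with the options documented above. A hedged example, assuming the same `report.csv` produced by a prior fetch as in the examples, and using the existing `-f` flag to switch output format:

```
$ minet scrape -e 'soup.scrape_one("title")' -i report.csv -f ndjson > titles.ndjson
```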
17 changes: 13 additions & 4 deletions minet/cli/scrape/__init__.py
@@ -14,8 +14,9 @@ def resolve_arguments(cli_args):
 Use multiple processes to scrape data from a batch of HTML files using
 minet scraping DSL documented here:
 https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-or a python function given using the -m/--module flag, or an already
-implemented typical scraping routine (listed below).
+or a python function given using the -m/--module flag, or a simple inline
+python expression given using the -e/--eval flag, or an already implemented
+typical scraping routine (listed below).
 
 It will output the scraped items as a CSV or NDJSON file.
 
@@ -67,10 +68,13 @@ def resolve_arguments(cli_args):
     $ minet scrape title -i report.csv > titles.csv
 
 . Using the `scrape` (default) function of target python module:
-    $ minet scrape scraper.py -i report.csv > titles.csv
+    $ minet scrape -m scraper.py -i report.csv > titles.csv
 
 . Using the `scrape_title` function of target python module:
-    $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+    $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
 
+. Using an inline python expression to evaluate:
+    $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
+
 . Indicating a custom path column (e.g. "file"):
     $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
@@ -108,6 +112,11 @@ def resolve_arguments(cli_args):
             "help": "Whether given scraper is a python target to import.",
             "action": "store_true",
         },
+        {
+            "flags": ["-e", "--eval"],
+            "help": "Whether given scraper should be a simple expression to evaluate.",
+            "action": "store_true"
+        },
         {
             "flags": ["-g", "--glob"],
             "help": "Will interpret given paths as glob patterns to resolve if given.",
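minet builds its argument parsers from declarative dicts like the one added above. As a rough illustration only (this is plain argparse, not minet's actual CLI machinery), the two boolean flags behave like so:

```
# A plain-argparse sketch of the -m and -e flags declared above.
# NOT minet's actual machinery; it only mirrors the declared semantics.
import argparse

parser = argparse.ArgumentParser(prog="minet scrape")
parser.add_argument("scraper", help="DSL file, named scraper, python target or inline expression")
parser.add_argument("-m", "--module", action="store_true",
                    help="Whether given scraper is a python target to import.")
parser.add_argument("-e", "--eval", action="store_true",
                    help="Whether given scraper should be a simple expression to evaluate.")

args = parser.parse_args(["-e", 'soup.scrape_one("title")'])
assert args.eval and not args.module
print(args.scraper)  # -> soup.scrape_one("title")
```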
2 changes: 2 additions & 0 deletions minet/cli/scrape/scrape.py
@@ -124,6 +124,8 @@ def action(cli_args):
     if cli_args.module:
         fn = import_target(cli_args.scraper, default="scrape")
         scraper = FunctionScraper(fn, strain=cli_args.strain)
+    elif cli_args.eval:
+        scraper = FunctionScraper(cli_args.scraper, strain=cli_args.strain)
     elif cli_args.scraper in NAMED_SCRAPERS:
         scraper = NAMED_SCRAPERS[cli_args.scraper]()
     else:
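Note the resolution order in `action`: `-m` is checked first, then `-e`, then the bundled scraper names, and only then is the argument treated as a DSL file to load. A toy sketch of that branching, with every minet internal (`import_target`, `NAMED_SCRAPERS`, the DSL loader) replaced by a stub:

```
# Toy reproduction of the branching above; all dependencies are stubs,
# only the order of the checks mirrors the commit.
NAMED_SCRAPERS = {"title": "built-in title scraper"}  # stub

def resolve_scraper(scraper: str, module: bool, eval_: bool):
    if module:
        return ("module", scraper)  # would import_target(scraper, default="scrape")
    if eval_:
        return ("eval", scraper)    # raw expression kept as a plain string
    if scraper in NAMED_SCRAPERS:
        return ("named", NAMED_SCRAPERS[scraper])
    return ("dsl", scraper)         # would load a YAML DSL definition

# With --eval, even a string matching a named scraper stays an expression:
assert resolve_scraper("title", module=False, eval_=True) == ("eval", "title")
assert resolve_scraper("title", module=False, eval_=False)[0] == "named"
```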
8 changes: 6 additions & 2 deletions minet/scrape/classes/function.py
@@ -36,7 +36,7 @@ def infer_fieldnames_from_function_return_type(fn: Callable) -> Optional[List[str]]:
 
 
 class FunctionScraper(ScraperBase):
-    fn: Callable[[RowWrapper, WonderfulSoup], Any]
+    fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]]
     fieldnames = None
     plural: bool
     tabular = True
@@ -45,9 +45,10 @@ class FunctionScraper(ScraperBase):
 
     def __init__(
         self,
-        fn: Callable[[RowWrapper, WonderfulSoup], Any],
+        fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]],
         strain: Optional[str] = None,
     ):
+        # NOTE: closures cannot be pickled without using third-party library `dill`.
         self.fn = fn
         self.plural = inspect.isgeneratorfunction(fn)
 
@@ -62,4 +63,7 @@ def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None):
         row = context["row"]
         soup = cast(WonderfulSoup, ensure_soup(html, strainer=self.strainer))
 
+        if isinstance(self.fn, str):
+            return eval(self.fn, {"row": row, "soup": soup}, None)
+
         return self.fn(row, soup)
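To make the new `__call__` branch concrete: when `fn` is a string, it is evaluated once per scraped document with `row` and `soup` as the only injected globals. A self-contained sketch of the same mechanism, substituting plain `bs4.BeautifulSoup` for minet's `WonderfulSoup` (so helpers like `scrape_one` are unavailable here):

```
# Standalone sketch of FunctionScraper's string-vs-callable dispatch.
# bs4 stands in for WonderfulSoup; the eval() call mirrors the commit.
from typing import Any, Callable, Union
from bs4 import BeautifulSoup

def scrape(fn: Union[str, Callable], row: dict, html: str) -> Any:
    soup = BeautifulSoup(html, "html.parser")
    if isinstance(fn, str):
        # Inline expression: `row` and `soup` are provided as globals.
        return eval(fn, {"row": row, "soup": soup}, None)
    return fn(row, soup)

html = "<html><head><title>Hello</title></head><body></body></html>"
print(scrape('soup.title.get_text()', {}, html))                 # Hello
print(scrape(lambda row, soup: soup.title.get_text(), {}, html)) # Hello
```

This also explains the design choice: a plain string pickles cleanly across processes, sidestepping the closure-pickling caveat raised in the NOTE above, and since `inspect.isgeneratorfunction` is False for strings, inline expressions always scrape a single item per document.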
