Adding -e/--eval flag to scrape command
Related to #922
Yomguithereal committed Dec 20, 2023
1 parent 0d7f03c commit be4554f
Showing 4 changed files with 35 additions and 13 deletions.
21 changes: 14 additions & 7 deletions docs/cli.md
@@ -1121,9 +1121,10 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/
 
 ```
 Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
-                    [--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES]
-                    [--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN]
-                    [--url-column URL_COLUMN] [--error-column ERROR_COLUMN]
+                    [--simple-progress] [-m] [-e] [-g] [-I INPUT_DIR]
+                    [-p PROCESSES] [--chunk-size CHUNK_SIZE]
+                    [--body-column BODY_COLUMN] [--url-column URL_COLUMN]
+                    [--error-column ERROR_COLUMN]
                     [--status-column STATUS_COLUMN]
                     [--encoding-column ENCODING_COLUMN]
                     [--mimetype-column MIMETYPE_COLUMN] [--encoding ENCODING]
@@ -1138,8 +1139,9 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
 Use multiple processes to scrape data from a batch of HTML files using
 minet scraping DSL documented here:
 https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-or a python function given using the -m/--module flag, or an already
-implemented typical scraping routine (listed below).
+or a python function given using the -m/--module flag, or a simple inline
+python expression given using the -e/--eval flag, or an already implemented
+typical scraping routine (listed below).
 
 It will output the scraped items as a CSV or NDJSON file.
 
@@ -1186,6 +1188,8 @@ Optional Arguments:
                                 Defaults to `encoding`.
   --error-column ERROR_COLUMN   Name of the CSV column containing a fetch error.
                                 Defaults to `fetch_error`.
+  -e, --eval                    Whether given scraper should be a simple
+                                expression to evaluate.
   -f, --format {csv,jsonl,ndjson}
                                 Output format. Defaults to `csv`.
   -g, --glob                    Will interpret given paths as glob patterns to
@@ -1273,10 +1277,13 @@ Examples:
     $ minet scrape title -i report.csv > titles.csv
 
 . Using the `scrape` (default) function of target python module:
-    $ minet scrape scraper.py -i report.csv > titles.csv
+    $ minet scrape -m scraper.py -i report.csv > titles.csv
 
 . Using the `scrape_title` function of target python module:
-    $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+    $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
 
+. Using an inline python expression to evaluate:
+    $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
+
 . Indicating a custom path column (e.g. "file"):
     $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
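A quick aside on composition: `-e/--eval` slots in like any other scraper source, so it combines with the options documented above. A hedged example, assuming the same `report.csv` produced by a prior fetch as in the examples, and using the existing `-f` flag to switch output format:

```
$ minet scrape -e 'soup.scrape_one("title")' -i report.csv -f ndjson > titles.ndjson
```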
17 changes: 13 additions & 4 deletions minet/cli/scrape/__init__.py
@@ -14,8 +14,9 @@ def resolve_arguments(cli_args):
 Use multiple processes to scrape data from a batch of HTML files using
 minet scraping DSL documented here:
 https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-or a python function given using the -m/--module flag, or an already
-implemented typical scraping routine (listed below).
+or a python function given using the -m/--module flag, or a simple inline
+python expression given using the -e/--eval flag, or an already implemented
+typical scraping routine (listed below).
 
 It will output the scraped items as a CSV or NDJSON file.
 
@@ -67,10 +68,13 @@ def resolve_arguments(cli_args):
     $ minet scrape title -i report.csv > titles.csv
 
 . Using the `scrape` (default) function of target python module:
-    $ minet scrape scraper.py -i report.csv > titles.csv
+    $ minet scrape -m scraper.py -i report.csv > titles.csv
 
 . Using the `scrape_title` function of target python module:
-    $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+    $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
 
+. Using an inline python expression to evaluate:
+    $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
+
 . Indicating a custom path column (e.g. "file"):
     $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
@@ -108,6 +112,11 @@ def resolve_arguments(cli_args):
             "help": "Whether given scraper is a python target to import.",
             "action": "store_true",
         },
+        {
+            "flags": ["-e", "--eval"],
+            "help": "Whether given scraper should be a simple expression to evaluate.",
+            "action": "store_true"
+        },
         {
             "flags": ["-g", "--glob"],
             "help": "Will interpret given paths as glob patterns to resolve if given.",
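minet builds its argument parsers from declarative dicts like the one added above. As a rough illustration only (this is plain argparse, not minet's actual CLI machinery), the two boolean flags behave like so:

```
# A plain-argparse sketch of the -m and -e flags declared above.
# NOT minet's actual machinery; it only mirrors the declared semantics.
import argparse

parser = argparse.ArgumentParser(prog="minet scrape")
parser.add_argument("scraper", help="DSL file, named scraper, python target or inline expression")
parser.add_argument("-m", "--module", action="store_true",
                    help="Whether given scraper is a python target to import.")
parser.add_argument("-e", "--eval", action="store_true",
                    help="Whether given scraper should be a simple expression to evaluate.")

args = parser.parse_args(["-e", 'soup.scrape_one("title")'])
assert args.eval and not args.module
print(args.scraper)  # -> soup.scrape_one("title")
```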
2 changes: 2 additions & 0 deletions minet/cli/scrape/scrape.py
@@ -124,6 +124,8 @@ def action(cli_args):
     if cli_args.module:
         fn = import_target(cli_args.scraper, default="scrape")
         scraper = FunctionScraper(fn, strain=cli_args.strain)
+    elif cli_args.eval:
+        scraper = FunctionScraper(cli_args.scraper, strain=cli_args.strain)
     elif cli_args.scraper in NAMED_SCRAPERS:
         scraper = NAMED_SCRAPERS[cli_args.scraper]()
     else:
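Note the resolution order in `action`: `-m` is checked first, then `-e`, then the bundled scraper names, and only then is the argument treated as a DSL file to load. A toy sketch of that branching, with every minet internal (`import_target`, `NAMED_SCRAPERS`, the DSL loader) replaced by a stub:

```
# Toy reproduction of the branching above; all dependencies are stubs,
# only the order of the checks mirrors the commit.
NAMED_SCRAPERS = {"title": "built-in title scraper"}  # stub

def resolve_scraper(scraper: str, module: bool, eval_: bool):
    if module:
        return ("module", scraper)  # would import_target(scraper, default="scrape")
    if eval_:
        return ("eval", scraper)    # raw expression kept as a plain string
    if scraper in NAMED_SCRAPERS:
        return ("named", NAMED_SCRAPERS[scraper])
    return ("dsl", scraper)         # would load a YAML DSL definition

# With --eval, even a string matching a named scraper stays an expression:
assert resolve_scraper("title", module=False, eval_=True) == ("eval", "title")
assert resolve_scraper("title", module=False, eval_=False)[0] == "named"
```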
8 changes: 6 additions & 2 deletions minet/scrape/classes/function.py
@@ -36,7 +36,7 @@ def infer_fieldnames_from_function_return_type(fn: Callable) -> Optional[List[str]]:
 
 
 class FunctionScraper(ScraperBase):
-    fn: Callable[[RowWrapper, WonderfulSoup], Any]
+    fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]]
     fieldnames = None
     plural: bool
     tabular = True
@@ -45,9 +45,10 @@ class FunctionScraper(ScraperBase):
 
     def __init__(
         self,
-        fn: Callable[[RowWrapper, WonderfulSoup], Any],
+        fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]],
         strain: Optional[str] = None,
     ):
+        # NOTE: closures cannot be pickled without using third-party library `dill`.
         self.fn = fn
         self.plural = inspect.isgeneratorfunction(fn)
 
@@ -62,4 +63,7 @@ def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None):
         row = context["row"]
         soup = cast(WonderfulSoup, ensure_soup(html, strainer=self.strainer))
 
+        if isinstance(self.fn, str):
+            return eval(self.fn, {"row": row, "soup": soup}, None)
+
         return self.fn(row, soup)
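To make the new `__call__` branch concrete: when `fn` is a string, it is evaluated once per scraped document with `row` and `soup` as the only injected globals. A self-contained sketch of the same mechanism, substituting plain `bs4.BeautifulSoup` for minet's `WonderfulSoup` (so helpers like `scrape_one` are unavailable here):

```
# Standalone sketch of FunctionScraper's string-vs-callable dispatch.
# bs4 stands in for WonderfulSoup; the eval() call mirrors the commit.
from typing import Any, Callable, Union
from bs4 import BeautifulSoup

def scrape(fn: Union[str, Callable], row: dict, html: str) -> Any:
    soup = BeautifulSoup(html, "html.parser")
    if isinstance(fn, str):
        # Inline expression: `row` and `soup` are provided as globals.
        return eval(fn, {"row": row, "soup": soup}, None)
    return fn(row, soup)

html = "<html><head><title>Hello</title></head><body></body></html>"
print(scrape('soup.title.get_text()', {}, html))                 # Hello
print(scrape(lambda row, soup: soup.title.get_text(), {}, html)) # Hello
```

This also explains the design choice: a plain string pickles cleanly across processes, sidestepping the closure-pickling caveat raised in the NOTE above, and since `inspect.isgeneratorfunction` is False for strings, inline expressions always scrape a single item per document.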
