From c06e8c64a6da618c132c9b0d3f07b4b5057491ed Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Fri, 15 Dec 2023 16:55:19 +0100
Subject: [PATCH] Porting scrape & crawl to newest inferring writer

NOTE: scrape -m is still unsafe
---
 ftest/scrapers/title.py    |  6 +-----
 minet/cli/crawl/crawl.py   |  4 ++--
 minet/cli/scrape/scrape.py | 17 ++++++++++-------
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/ftest/scrapers/title.py b/ftest/scrapers/title.py
index 1829ec9503..63a9b25143 100644
--- a/ftest/scrapers/title.py
+++ b/ftest/scrapers/title.py
@@ -3,8 +3,4 @@
 
 
 def scrape(row: RowWrapper, soup: WonderfulSoup):
-    return {"url": row.url, "title": soup.scrape_one("title")}
-
-
-def titles(row: RowWrapper, soup: WonderfulSoup):
-    yield soup.scrape_one("title")
+    return soup.scrape_one("title")
diff --git a/minet/cli/crawl/crawl.py b/minet/cli/crawl/crawl.py
index 23cc89984b..3012311a01 100644
--- a/minet/cli/crawl/crawl.py
+++ b/minet/cli/crawl/crawl.py
@@ -85,7 +85,7 @@ def __add_file(self, name: Optional[str], path: str, spider):
 
         if self.format == "csv":
             # TODO: ability to pass fieldnames? from spider?
-            w = casanova.InferringWriter(f, add=["job_id"])
+            w = casanova.InferringWriter(f, prepend=["job_id"])
         elif self.format == "jsonl" or self.format == "ndjson":
             w = ndjson.writer(f)
         else:
@@ -97,7 +97,7 @@ def __unpack_result(self, result: SuccessfulCrawlResult, data):
         job_id = result.job.id
 
         if self.format == "csv":
-            return (data, [job_id])
+            return ([job_id], data)
 
         return ({"job_id": job_id, "data": data},)
diff --git a/minet/cli/scrape/scrape.py b/minet/cli/scrape/scrape.py
index c287079d3f..741512600b 100644
--- a/minet/cli/scrape/scrape.py
+++ b/minet/cli/scrape/scrape.py
@@ -194,16 +194,19 @@ def action(cli_args):
 
     if cli_args.format == "csv":
         if isinstance(scraper, FunctionScraper):
-            reader = casanova.reader(cli_args.input, total=cli_args.total)
-
-            # TODO: support for inferring_enricher
-            # TODO: support forwarding cases that will yield None
-            writer = casanova.inferring_writer(
-                cli_args.output, plural_separator=cli_args.plural_separator
+            enricher = casanova.inferring_enricher(
+                cli_args.input,
+                cli_args.output,
+                total=cli_args.total,
+                plural_separator=cli_args.plural_separator,
+                select=cli_args.select,
+                mapping_sample_size=512,
+                buffer_optionals=True,
             )
+            reader = enricher
 
             def writerow(row, item):
-                writer.writerow(item)
+                enricher.writerow(row, item)
         else:
             assert scraper.fieldnames is not None
 
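
Note on the CSV paths above: casanova's inferring writer derives the output
fieldnames from the shape of the first value it is given (scalar, dict,
list...), which is why ftest/scrapers/title.py can now return a bare string,
and why crawl.py switches from add= to prepend= so that job_id lands before
the inferred columns rather than after them. Below is a minimal usage sketch,
not minet code: the constructor call is taken verbatim from the crawl.py hunk,
but the call shape of writerow is an assumption mirroring the ([job_id], data)
tuple returned by __unpack_result.

    import casanova

    with open("output.csv", "w", newline="") as f:
        # Verbatim from crawl.py above: reserve a leading "job_id" column and
        # let the remaining columns be inferred from the first written value.
        writer = casanova.InferringWriter(f, prepend=["job_id"])

        # Assumed call shape: prepended values first, then the data to infer
        # from, matching how __unpack_result's tuple is consumed.
        writer.writerow(["spider#0"], {"url": "https://example.com", "title": "Example"})

Under these assumptions the file starts with a job_id,url,title header row
followed by one data row.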