Porting scrape & crawl to newest inferring writer
NOTE: scrape -m is still unsafe
Yomguithereal committed Dec 15, 2023
1 parent 8b80fef commit c06e8c6
Showing 3 changed files with 15 additions and 14 deletions.
6 changes: 1 addition & 5 deletions ftest/scrapers/title.py
@@ -3,8 +3,4 @@
 
 
 def scrape(row: RowWrapper, soup: WonderfulSoup):
-    return {"url": row.url, "title": soup.scrape_one("title")}
-
-
-def titles(row: RowWrapper, soup: WonderfulSoup):
-    yield soup.scrape_one("title")
+    return soup.scrape_one("title")
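For context, casanova's inferring writer derives the output CSV header from the first record it is asked to write, which is what lets this fixture return a bare value instead of a dict. A minimal sketch of that behavior, kept to the calls visible in this commit; the single-argument writerow and the exact inferred column names are assumptions:

import io

import casanova

buf = io.StringIO()
writer = casanova.InferringWriter(buf)

# The header is inferred from the first record written: a dict is
# expected to yield one column per key ("url,title" for the old
# fixture), while a bare string like the new fixture returns would
# presumably yield a single inferred column.
writer.writerow({"url": "https://example.com", "title": "Example"})

print(buf.getvalue())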
4 changes: 2 additions & 2 deletions minet/cli/crawl/crawl.py
@@ -85,7 +85,7 @@ def __add_file(self, name: Optional[str], path: str, spider):
 
         if self.format == "csv":
             # TODO: ability to pass fieldnames? from spider?
-            w = casanova.InferringWriter(f, add=["job_id"])
+            w = casanova.InferringWriter(f, prepend=["job_id"])
         elif self.format == "jsonl" or self.format == "ndjson":
             w = ndjson.writer(f)
         else:
@@ -97,7 +97,7 @@ def __unpack_result(self, result: SuccessfulCrawlResult, data):
         job_id = result.job.id
 
         if self.format == "csv":
-            return (data, [job_id])
+            return ([job_id], data)
 
         return ({"job_id": job_id, "data": data},)
 
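In the csv branch, prepend=["job_id"] now pairs with a result tuple that puts the prepended values first, so the job id column lands before whatever columns get inferred from the crawl data. A hedged sketch of how the two presumably line up once __unpack_result's tuple is splatted into writerow; the two-argument writerow call and the resulting header are assumptions based solely on this diff:

import io

import casanova

buf = io.StringIO()

# One fixed, prepended column plus columns inferred from the data.
w = casanova.InferringWriter(buf, prepend=["job_id"])

job_id = "job-1"                       # illustrative values
data = {"url": "https://example.com"}

# __unpack_result now returns ([job_id], data) for csv, which would
# reach the writer roughly as:
w.writerow([job_id], data)

print(buf.getvalue())  # expected header: job_id,url (assumed)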
19 changes: 12 additions & 7 deletions minet/cli/scrape/scrape.py
@@ -194,16 +194,19 @@ def action(cli_args):
 
     if cli_args.format == "csv":
         if isinstance(scraper, FunctionScraper):
-            reader = casanova.reader(cli_args.input, total=cli_args.total)
-
-            # TODO: support for inferring_enricher
-            # TODO: support forwarding cases that will yield None
-            writer = casanova.inferring_writer(
-                cli_args.output, plural_separator=cli_args.plural_separator
+            enricher = casanova.inferring_enricher(
+                cli_args.input,
+                cli_args.output,
+                total=cli_args.total,
+                plural_separator=cli_args.plural_separator,
+                select=cli_args.select,
+                mapping_sample_size=512,
+                buffer_optionals=True,
             )
+            reader = enricher
 
             def writerow(row, item):
-                writer.writerow(item)
+                enricher.writerow(row, item)
 
         else:
             assert scraper.fieldnames is not None
@@ -356,6 +359,8 @@ def payloads() -> Iterator[ScrapeWorkerPayload]:
         assert result.items is not None
         items = result.items
 
+        print(items)
+
         with writer_lock:
             for item in items:
                 writerow(original_item.row, item)
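The function-scraper path now routes through casanova's inferring enricher, which reads the input CSV and writes each input row back out alongside columns inferred from the scraped item, replacing the previous reader plus inferring writer pair; the writerow calls still happen under writer_lock in the threaded pool, which may be what the "scrape -m is still unsafe" note refers to. A rough sketch of the resulting flow, reusing only keyword arguments visible in the diff; the file names and the stand-in scraped item are illustrative:

import casanova

# urls.csv is an assumed input file; report.csv receives each input
# row plus the columns inferred from the scraped item.
with open("urls.csv") as infile, open("report.csv", "w", newline="") as outfile:
    enricher = casanova.inferring_enricher(
        infile,
        outfile,
        plural_separator="|",
        mapping_sample_size=512,
        buffer_optionals=True,
    )

    # The enricher is iterated like a reader; in minet the item would
    # come from fetching and scraping the page, here a literal dict
    # stands in for it.
    for row in enricher:
        item = {"title": "Some scraped title"}
        enricher.writerow(row, item)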
