Skip to content

Commit

Permalink
Include span information when converting to JSONL
Browse files Browse the repository at this point in the history
This is useful when we want to hydrate a Prodigy dataset
with the data we need.
  • Loading branch information
ljvmiranda921 committed Feb 23, 2023
1 parent 76b1834 commit 8c984fc
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions integrations/prodigy_openai/scripts/convert_to_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,20 @@ def convert_to_jsonl(
# fmt: on
):
"""Convert spaCy file into JSONL to use for Prodigy"""

nlp = spacy.blank(lang)
doc_bin = DocBin().from_disk(input_path)
docs = list(doc_bin.get_docs(nlp.vocab))

texts = [{"text": doc.text} for doc in docs]
msg.text(f"Found {len(texts)} documents in {input_path}")
records = []
for doc in doc_bin.get_docs(nlp.vocab):
spans = [
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
for ent in doc.ents
]
records.append({"text": doc.text, "spans": spans})

msg.text(f"Found {len(records)} documents in {input_path}")

srsly.write_jsonl(output_path, texts)
srsly.write_jsonl(output_path, records)
msg.good(f"Saved texts to {output_path}")


Expand Down

0 comments on commit 8c984fc

Please sign in to comment.