Skip to content

Commit

Permalink
[wip] Update
Browse files Browse the repository at this point in the history
  • Loading branch information
ljvmiranda921 committed Apr 18, 2023
1 parent 1a95234 commit 0befd99
Showing 1 changed file with 48 additions and 0 deletions.
48 changes: 48 additions & 0 deletions integrations/prodigy_openai/scripts/get_batches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from collections import Counter
from pathlib import Path

import typer
from wasabi import msg

import spacy
from spacy.tokens import Doc, DocBin

# Short aliases for typer's parameter factories.
# NOTE(review): the command visible in this script calls typer.Argument /
# typer.Option directly, so these aliases look unused here — confirm before removing.
Arg = typer.Argument
Opt = typer.Option


def get_distribution(
    # fmt: off
    input_path: Path = typer.Argument(..., help="Path to the spaCy file."),
    n: int = typer.Option(5, "-n", "--top-n", help="Top-n entities to include in the report."),
    # fmt: on
):
    """Report the relative frequency of entity labels in a spaCy file.

    Loads the serialized ``DocBin`` at ``input_path``, counts how often each
    entity label occurs across all documents, and prints the ``n`` most
    common labels together with their share of all entity mentions.

    Args:
        input_path: Path to the serialized spaCy ``DocBin`` file.
        n: Number of most frequent entity labels to include in the report.
    """
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_disk(input_path)
    docs = list(doc_bin.get_docs(nlp.vocab))

    num_docs = len(docs)
    msg.info(f"Found {num_docs} documents in {input_path}")

    # Count every entity mention by label. Counter starts missing keys at 0,
    # so the first occurrence of a label is counted as 1 (not dropped).
    entity_counts = Counter(ent.label_ for doc in docs for ent in doc.ents)

    # Normalize by the total number of entity mentions.
    total = sum(entity_counts.values())
    if total == 0:
        # Nothing to normalize; say so instead of printing an empty report.
        msg.warn(f"No entities found in {input_path}")
        return

    _fmt_counts = " ".join(
        f"{label} ({(count / total) * 100:.2f}%)"
        for label, count in entity_counts.most_common(n)
    )
    msg.text(f"Top-{n} entities by count: {_fmt_counts}")


# Script entry point: expose get_distribution as a command-line interface via typer.
if __name__ == "__main__":
    typer.run(get_distribution)

0 comments on commit 0befd99

Please sign in to comment.