forked from explosion/prodigy-recipes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ner_match.py
63 lines (56 loc) · 2.46 KB
/
ner_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.models.matcher import PatternMatcher
from prodigy.components.db import connect
from prodigy.util import split_string
import spacy
from typing import List, Optional
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "ner.match",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    patterns=("Optional match patterns", "option", "p", str),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    resume=("Resume from existing dataset and update matcher", "flag", "R", bool),
)
def ner_match(
    dataset: str,
    spacy_model: str,
    source: str,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    resume: bool = False,
):
    """
    Stream pattern matches from a JSONL source for accept/reject annotation.

    Examples from `source` are run through a `PatternMatcher` built on the
    given spaCy model and loaded from the `patterns` file; only the matched
    examples are passed on to the annotation interface.

    Args:
        dataset: Name of the dataset the annotations are saved to.
        spacy_model: Name or path handed to `spacy.load`.
        source: Path to the JSONL file with the input examples.
        patterns: Path to the match patterns file.
        exclude: Names of datasets to exclude.
        resume: If set, update the matcher with the annotations already
            stored in `dataset` before streaming.

    Returns:
        The recipe components as a dict with the keys `view_id`, `dataset`,
        `stream`, `exclude` and `config`.
    """
    pipeline = spacy.load(spacy_model)

    # NOTE(review): `patterns` defaults to None but is handed to `from_disk`
    # unconditionally -- confirm how PatternMatcher.from_disk treats None.
    pattern_matcher = PatternMatcher(pipeline).from_disk(patterns)

    if resume:
        # Database connection is configured through prodigy.json.
        db = connect()
        dataset_exists = dataset and dataset in db
        if dataset_exists:
            # Feed previously saved annotations back into the matcher.
            prior_annotations = db.get_dataset(dataset)
            pattern_matcher.update(prior_annotations)

    # One dictionary per example in the JSONL file.
    raw_examples = JSONL(source)

    # The matcher produces (score, example) pairs; the score is dropped so
    # that only the examples themselves reach the annotation UI.
    scored_examples = pattern_matcher(raw_examples)
    matched_examples = (example for _score, example in scored_examples)

    # Extra settings, mostly consumed by the app UI.
    ui_config = {"lang": pipeline.lang}

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Dataset the annotations are saved to
        "stream": matched_examples,  # Incoming stream of examples
        "exclude": exclude,  # Dataset names to exclude
        "config": ui_config,
    }