Skip to content

Commit b9efd80

Browse files
it176131 and Francisco Aranda authored
synset pos parameter (#15)
* modified: spacy_wordnet/wordnet_domains.py - importing Synset class and added return type hint to __find_synsets(...) method - pep8 character limit adjustment * modified: spacy_wordnet/wordnet_domains.py - added optional pos param to __find_synsets(...) method * modified: spacy_wordnet/wordnet_domains.py - argument handling for pos param * modified: spacy_wordnet/wordnet_domains.py - swapping all(map(...)) for set(...).difference(...) which gives a slight boost in speed and readability * modified: spacy_wordnet/wordnet_domains.py - added try/except to attempt to convert pos arg to list * modified: spacy_wordnet/wordnet_domains.py - filtering acceptable_pos using pos values and assigning to token_pos which will be used to determine which tokens to get synsets for * modified: spacy_wordnet/wordnet_domains.py - moved call from self.__synsets declaration into .synsets(...) method allowing user to supply pos args * modified: spacy_wordnet/wordnet_domains.py - return type hint and docstring for synsets(...) method * modified: tests/test_wordnet_annotator.py - added three assertions for pos param in test_english_annotations() method * modified: spacy_wordnet/wordnet_domains.py - fixed error type hint in synsets(...) method * modified: spacy_wordnet/wordnet_domains.py - fixed type error in __find_lemmas() method by swapping self.__synsets attribute with self.synsets(...) method - pep8 character limit fix in __find_lemmas() method * modified: spacy_wordnet/wordnet_domains.py - defined token_synsets as a separate list and filtered returned synsets in wn.synsets and extending token_synsets in __find_synsets(...) method * modified: tests/test_wordnet_annotator.py changed expected_adj_synsets to set() instead of {} (a dict) in test_english_annotations() method * Update spacy_wordnet/wordnet_domains.py param type hint spacing/formatting in synsets(...) 
method Co-authored-by: Francisco Aranda <[email protected]> * Update spacy_wordnet/wordnet_domains.py param type hint spacing/formatting in __find_synsets(...) method Co-authored-by: Francisco Aranda <[email protected]> * use token.pos if pos argument is none to mimic previous behavior. Co-authored-by: Francisco Aranda <[email protected]> * Update wordnet_domains.py modified docstring to reflect what happens if pos argument is none * modified: tests/test_wordnet_annotator.py - added assert to test that list of pos args will return expected results * modified: tests/test_wordnet_annotator.py - added test for when pos argument is none * Update spacy_wordnet/wordnet_domains.py Checking if `token.pos` is an acceptable value before appending its lemma to the `word_variants` list. This avoids unexpected results such as when `token.pos` is an `ADVERB`. Co-authored-by: Francisco Aranda <[email protected]> * Update wordnet_domains.py Updated docstring so user knows results are limited to NOUN, VERB, and ADJ even if `pos` is None. Co-authored-by: Ian Thompson <[email protected]> Co-authored-by: Francisco Aranda <[email protected]>
1 parent 4bc9fe0 commit b9efd80

File tree

2 files changed

+94
-14
lines changed

2 files changed

+94
-14
lines changed

spacy_wordnet/wordnet_domains.py

+48-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
from typing import Union
12
from nltk.corpus import wordnet as wn
3+
from nltk.corpus.reader.wordnet import Synset
24
from spacy.tokens.token import Token
35

46
from spacy_wordnet.__utils__ import *
@@ -43,16 +45,28 @@ class Wordnet(object):
4345
def __init__(self, token: Token, lang: str = "es"):
4446
self.__token = token
4547
self.__lang = fetch_wordnet_lang(lang)
46-
self.__synsets = self.__find_synsets(token, self.__lang)
48+
self.__synsets = self.__find_synsets
4749
self.__lemmas = self.__find_lemmas()
4850
self.__wordnet_domains = self.__find_wordnet_domains()
4951

52+
def synsets(self, pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
53+
"""
54+
Load all synsets with a given part of speech tag.
55+
If no pos is specified and `token.pos` is a VERB, NOUN,
56+
or ADJ, synsets with the same parts of speech as
57+
`token.pos` will be loaded. If `token.pos` is not a
58+
VERB, NOUN, or ADJ and no pos is specified, an empty
59+
list will be returned.
60+
61+
:param pos: filter returned synsets by part(s) of speech.
62+
Acceptable values are "verb", "noun", and "adj".
63+
:return: list of synsets
64+
"""
65+
return self.__synsets(self.__token, self.__lang, pos=pos)
66+
5067
def lang(self):
5168
return self.__lang
5269

53-
def synsets(self):
54-
return self.__synsets
55-
5670
def lemmas(self):
5771
return self.__lemmas
5872

@@ -68,16 +82,40 @@ def wordnet_synsets_for_domain(self, domains: List[str]):
6882
]
6983

7084
@staticmethod
71-
def __find_synsets(token: Token, lang: str):
85+
def __find_synsets(token: Token,
86+
lang: str,
87+
pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
88+
if pos is None:
89+
pos = []
90+
elif isinstance(pos, str):
91+
pos = [pos]
92+
elif not isinstance(pos, list):
93+
try:
94+
pos = list(pos)
95+
except TypeError:
96+
raise TypeError("pos argument must be None, type str, or type list.")
97+
98+
acceptable_pos = {"verb": VERB, "noun": NOUN, "adj": ADJ} # We can define this as a private class constant
99+
# check if any element in `pos` is not in `acceptable_pos`
100+
if set(pos).difference(acceptable_pos):
101+
raise ValueError("pos argument must be a combination of 'verb', "
102+
"'noun', or 'adj'.")
103+
104+
token_pos: List[int] = [acceptable_pos[k] for k in pos]
105+
if not token_pos:
106+
token_pos = [token.pos]
72107
word_variants = [token.text]
73-
if token.pos in [VERB, NOUN, ADJ]:
108+
if token.pos in (token_pos if pos else acceptable_pos.values()):
74109
# extend synset coverage using lemmas
75110
word_variants.append(token.lemma_)
76111

77112
for word in word_variants:
78-
token_synsets = wn.synsets(
79-
word, pos=spacy2wordnet_pos(token.pos), lang=lang
80-
)
113+
token_synsets: List[Synset] = []
114+
for p in token_pos:
115+
token_synsets.extend(wn.synsets(
116+
word, pos=spacy2wordnet_pos(p), lang=lang
117+
))
118+
81119
if token_synsets:
82120
return token_synsets
83121

@@ -95,8 +133,4 @@ def __find_wordnet_domains(self):
95133
]
96134

97135
def __find_lemmas(self):
98-
return [
99-
lemma
100-
for synset in self.synsets()
101-
for lemma in synset.lemmas(lang=self.__lang)
102-
]
136+
return [lemma for synset in self.synsets() for lemma in synset.lemmas(lang=self.__lang)]

tests/test_wordnet_annotator.py

+46
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import unittest
22

3+
from nltk.corpus import wordnet as wn
34
import spacy
45

56
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
@@ -28,6 +29,51 @@ def test_english_annotations(self):
2829
assert token._.wordnet.lemmas()
2930
assert token._.wordnet.wordnet_domains()
3031

32+
actual_none_synsets = set(token._.wordnet.synsets(pos=None))
33+
expected_none_synsets = {wn.synset("contract.n.01"),
34+
wn.synset("contract.n.02"),
35+
wn.synset("contract.n.03")}
36+
assert actual_none_synsets == expected_none_synsets
37+
38+
actual_verb_synsets = set(token._.wordnet.synsets(pos="verb"))
39+
expected_verb_synsets = {wn.synset('abridge.v.01'),
40+
wn.synset('compress.v.02'),
41+
wn.synset('condense.v.07'),
42+
wn.synset('contract.v.01'),
43+
wn.synset('contract.v.04'),
44+
wn.synset('contract.v.06'),
45+
wn.synset('narrow.v.01'),
46+
wn.synset('shrink.v.04'),
47+
wn.synset('sign.v.04')}
48+
assert actual_verb_synsets == expected_verb_synsets
49+
50+
actual_noun_synsets = set(token._.wordnet.synsets(pos="noun"))
51+
expected_noun_synsets = {wn.synset('contract.n.01'),
52+
wn.synset('contract.n.02'),
53+
wn.synset('contract.n.03')}
54+
assert actual_noun_synsets == expected_noun_synsets
55+
56+
actual_adj_synsets = set(token._.wordnet.synsets(pos="adj"))
57+
expected_adj_synsets = set()
58+
assert actual_adj_synsets == expected_adj_synsets
59+
60+
actual_verb_noun_synsets = set(token._.wordnet.synsets(
61+
pos=["verb", "noun"])
62+
)
63+
expected_verb_noun_synsets = {wn.synset('abridge.v.01'),
64+
wn.synset('compress.v.02'),
65+
wn.synset('condense.v.07'),
66+
wn.synset('contract.v.01'),
67+
wn.synset('contract.v.04'),
68+
wn.synset('contract.v.06'),
69+
wn.synset('narrow.v.01'),
70+
wn.synset('shrink.v.04'),
71+
wn.synset('sign.v.04'),
72+
wn.synset('contract.n.01'),
73+
wn.synset('contract.n.02'),
74+
wn.synset('contract.n.03')}
75+
assert actual_verb_noun_synsets == expected_verb_noun_synsets
76+
3177
def test_generate_variants_from_domain_list(self):
3278

3379
economy_domains = ["finance", "banking"]

0 commit comments

Comments (0)