Skip to content

Commit

Permalink
Corpus - Remove dictionary
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 1, 2023
1 parent 13429e5 commit 2b48e94
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 38 deletions.
24 changes: 5 additions & 19 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
"""
self.text_features = [] # list of text features for mining
self._tokens = None
self._dictionary = None
self.ngram_range = (1, 1)
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
Expand Down Expand Up @@ -382,13 +381,12 @@ def documents_from_features(self, feats):
return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row))
for row in data.metas]

def store_tokens(self, tokens, dictionary=None):
def store_tokens(self, tokens):
"""
Args:
tokens (list): List of lists containing tokens.
"""
self._tokens = np.array(tokens, dtype=object)
self._dictionary = dictionary or corpora.Dictionary(self.tokens)

@property
def tokens(self):
Expand All @@ -397,7 +395,7 @@ def tokens(self):
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
return self._base_tokens()[0]
return self._base_tokens()
return self._tokens

def has_tokens(self):
Expand All @@ -409,19 +407,9 @@ def _base_tokens(self):
BASE_TOKENIZER, PreprocessorList

# don't use anything that requires NLTK data to assure async download
base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
BASE_TOKENIZER])
base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
corpus = base_preprocessors(self)
return corpus.tokens, corpus.dictionary

@property
def dictionary(self):
"""
corpora.Dictionary: A token to id mapper.
"""
if self._dictionary is None:
return self._base_tokens()[1]
return self._dictionary
return corpus.tokens

@property
def pos_tags(self):
Expand Down Expand Up @@ -476,10 +464,9 @@ def ngrams(self):
def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(text_features=copy(self.text_features))
# since tokens are considered immutable, copies are not needed
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
c.pos_tags = self.pos_tags
c.name = self.name
Expand Down Expand Up @@ -640,7 +627,6 @@ def retain_preprocessing(orig, new, key=...):
new.pos_tags = orig.pos_tags
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary

if isinstance(new, Corpus):
# _find_identical_feature returns None when feature not found
Expand Down
15 changes: 4 additions & 11 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
callback(0, "Filtering...")
filtered_tokens = []
filtered_tags = []
Expand All @@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable,
if corpus.pos_tags is not None:
filtered_tags.append(list(compress(corpus.pos_tags[i],
filter_map)))
if dictionary is None:
corpus.store_tokens(filtered_tokens)
else:
corpus.store_tokens(filtered_tokens, dictionary)
corpus.store_tokens(filtered_tokens)
if filtered_tags:
corpus.pos_tags = np.array(filtered_tags, dtype=object)
return corpus
Expand Down Expand Up @@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
def _fit(self, corpus: Corpus):
raise NotImplemented

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
corpus = super()._filter_tokens(corpus, callback,
dictionary=self._dictionary)
return corpus
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
return super()._filter_tokens(corpus, callback)

def _check(self, token):
assert self._lexicon is not None
Expand Down
8 changes: 0 additions & 8 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ def test_extend_attributes_keep_preprocessing(self):

self.assertEqual(len(new_c._tokens), len(c))
np.testing.assert_equal(new_c._tokens, new_c._tokens)
self.assertEqual(new_c._dictionary, c._dictionary)
self.assertEqual(new_c.text_features, c.text_features)
self.assertEqual(new_c.ngram_range, c.ngram_range)
self.assertEqual(new_c.attributes, c.attributes)
Expand Down Expand Up @@ -406,20 +405,17 @@ def test_getitem(self):
self.assertEqual(len(sel), 1)
self.assertEqual(len(sel._tokens), 1)
np.testing.assert_equal(sel._tokens, np.array([c._tokens[0]]))
self.assertEqual(sel._dictionary, c._dictionary)

sel = c[0:5]
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
self.assertEqual(sel._dictionary, c._dictionary)

ind = [3, 4, 5, 6]
sel = c[ind]
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
Expand All @@ -429,7 +425,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
Expand All @@ -439,7 +434,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[list(ind)])
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
Expand All @@ -448,7 +442,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(c))
self.assertEqual(len(sel._tokens), len(c))
np.testing.assert_equal(sel._tokens, c._tokens)
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
Expand All @@ -457,7 +450,6 @@ def test_getitem(self):
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
self.assertEqual(sel._dictionary, c._dictionary)

def test_set_text_features(self):
c = Corpus.from_file('friends-transcripts')[:100]
Expand Down

0 comments on commit 2b48e94

Please sign in to comment.