25 changes: 25 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,25 @@
name: Python package

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[all]
      - name: Run tests
        run: |
          PYTHONPATH=. pytest -q
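For reference, the same quiet test pass can be reproduced locally from a Python entry point. This is a minimal sketch, assuming the package was already installed with `pip install .[all]` and that the current directory is the repository root, mirroring the workflow step:

```python
# Sketch: local equivalent of the CI "Run tests" step.
import sys

import pytest

sys.path.insert(0, ".")        # stand-in for PYTHONPATH=. in the workflow
sys.exit(pytest.main(["-q"]))  # -q matches the quiet flag used in CI
```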
8 changes: 6 additions & 2 deletions README.md
@@ -58,10 +58,14 @@ Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21)

* Python 3

The following software packages are dependencies and will be installed automatically.
The library installs its core dependencies automatically. Optional extras can be
installed for additional augmenters.

```shell
$ pip install numpy nltk gensim==3.8.3 textblob googletrans
$ pip install numpy nltk textblob
# Install extras
$ pip install 'textaugment[word2vec]' # requires gensim
$ pip install 'textaugment[translate]' # requires googletrans

```
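To make the core/extras split concrete, here is a small sketch (not part of the install instructions) that only pulls in the optional augmenters when their backing packages are importable; the module paths are assumed from the current repository layout:

```python
# Sketch: import the optional augmenters only if their backends are installed.
from importlib.util import find_spec

if find_spec("gensim") is not None:        # provided by textaugment[word2vec]
    from textaugment.word2vec import Word2vec
if find_spec("googletrans") is not None:   # provided by textaugment[translate]
    from textaugment.translate import Translate
```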
The following code downloads the NLTK corpus for [wordnet](http://www.nltk.org/howto/wordnet.html).
7 changes: 5 additions & 2 deletions requirements.txt
@@ -1,5 +1,8 @@
gensim>=4.0
googletrans>=2
# Core requirements
nltk
numpy
textblob

# Optional features
gensim>=4.0 # for Word2Vec augmenter
googletrans>=4.0.2 # for Translate augmenter
9 changes: 7 additions & 2 deletions setup.py
@@ -42,8 +42,13 @@ def read(fname):
    description='A library for augmenting text for natural language processing applications.',
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    install_requires=['nltk', 'gensim>=4.0', 'textblob', 'numpy', 'googletrans>=2'],
    classifiers=[
    install_requires=['nltk', 'textblob', 'numpy'],
    extras_require={
        'word2vec': ['gensim>=4.0'],
        'translate': ['googletrans>=4.0.2'],
        'all': ['gensim>=4.0', 'googletrans>=4.0.2']
    },
    classifiers=[
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "License :: OSI Approved :: MIT License",
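As a quick way to see which optional groups actually resolved in a given environment, a sketch using the standard library (the distribution names are taken from the extras above):

```python
# Sketch: report whether the optional backends behind each extra are present.
from importlib import metadata

for dist in ("gensim", "googletrans"):
    try:
        print(f"{dist} {metadata.version(dist)}")
    except metadata.PackageNotFoundError:
        print(f"{dist} missing - install the matching textaugment extra")
```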
17 changes: 10 additions & 7 deletions tests/test_word2vec.py
@@ -1,21 +1,24 @@
import unittest
import sys
from textaugment.word2vec import Word2vec
from gensim.test.utils import common_texts
from gensim.models import Word2Vec


class InputTestCase(unittest.TestCase):

    def setUp(self):
        self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi"
        self.wrongpath = "/home/tjs/dev/papu/models/gensim_cbow_sepedi-wrong"
        self.w = Word2vec(model=self.path)
        # create a tiny model for testing
        self.model = Word2Vec(common_texts, vector_size=20, min_count=1)
        self.wrongpath = "/tmp/non_existent_model"
        self.w = Word2vec(model=self.model)

    def test_augment(self):
        with self.assertRaises(TypeError, msg="Value for p should be float"):
            Word2vec(model=self.path, p="foo")
            Word2vec(model=self.model, p="foo")

        with self.assertRaises(TypeError, msg="Value for runs should be integer"):
            Word2vec(model=self.path, runs="foo")
            Word2vec(model=self.model, runs="foo")

        with self.assertRaises(FileNotFoundError, msg="The model is not found"):
            Word2vec(model=self.wrongpath)
@@ -30,8 +33,8 @@ def test_augment(self):
class OutputTestCase(unittest.TestCase):

    def setUp(self):
        self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi"
        self.w = Word2vec(model=self.path)
        self.model = Word2Vec(common_texts, vector_size=20, min_count=1)
        self.w = Word2vec(model=self.model)
        self.data = "We are testing"

    def test_augment(self):
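The switch from an on-disk Sepedi model to a throwaway model built from gensim's `common_texts` makes these tests self-contained. A minimal usage sketch along the same lines; the `augment` call and its string return value are assumptions based on the tests above:

```python
# Sketch: build a tiny in-memory model and pass it straight to the augmenter.
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from textaugment.word2vec import Word2vec

model = Word2Vec(common_texts, vector_size=20, min_count=1)  # toy model, not meaningful vectors
augmenter = Word2vec(model=model)
print(augmenter.augment("We are testing"))  # expected to return an augmented string
```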
9 changes: 9 additions & 0 deletions tests/test_wordnet.py
@@ -2,10 +2,15 @@
import sys
import numpy as np
from textaugment.wordnet import Wordnet
import nltk


class InputTestCase(unittest.TestCase):
    def setUp(self):
        nltk.download('punkt', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('averaged_perceptron_tagger_eng', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.p = 0.8
        self.data = ["I", "am", "testing"]
        self.w = Wordnet(p=self.p)
@@ -31,6 +36,10 @@ def test_augment(self):
class OutputTestCase(unittest.TestCase):

    def setUp(self):
        nltk.download('punkt', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('averaged_perceptron_tagger_eng', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.p = 0.8
        self.data = ["I", "am", "testing"]
        self.data2 = "известен още с псевдонимите"
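Downloading the NLTK resources inside `setUp` keeps a fresh CI runner from failing on missing corpora. Outside the tests, the same one-time setup would look roughly like this (resource names copied from the `setUp` above; whether every one is required depends on the installed NLTK version):

```python
# Sketch: fetch the NLTK data the Wordnet augmenter relies on.
import nltk

for resource in ("punkt", "averaged_perceptron_tagger",
                 "averaged_perceptron_tagger_eng", "wordnet"):
    nltk.download(resource, quiet=True)
```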
40 changes: 27 additions & 13 deletions textaugment/translate.py
@@ -8,8 +8,11 @@

from .constants import LANGUAGES
from textblob import TextBlob
from textblob.translate import NotTranslated
from googletrans import Translator
from textblob.exceptions import NotTranslated
try:
    from googletrans import Translator
except Exception:  # googletrans might not be installed
    Translator = None


class Translate:
@@ -131,17 +134,28 @@ def augment(self, data):
"""
if type(data) is not str:
raise TypeError("DataType must be a string")
data = TextBlob(data.lower())
try:
data = data.translate(from_lang=self.src, to=self.to)
data = data.translate(from_lang=self.to, to=self.src)
except NotTranslated:
try: # Switch to googletrans to do translation.
txt = data
blob = TextBlob(txt)

# TextBlob removed builtin translation in >0.17, so guard the call.
translated = None
if hasattr(blob, "translate"):
try:
translated = blob.translate(from_lang=self.src, to=self.to)
translated = translated.translate(from_lang=self.to, to=self.src)
except NotTranslated:
translated = None
except Exception:
translated = None

if translated is None and Translator is not None:
try: # Fallback to googletrans
translator = Translator()
data = translator.translate(data, dest=self.to, src=self.src).text
data = translator.translate(data, dest=self.src, src=self.to).text
translated = translator.translate(txt, dest=self.to, src=self.src).text
translated = translator.translate(translated, dest=self.src, src=self.to).text
except Exception:
print("Error Not translated.\n")
raise
translated = txt
elif translated is None:
translated = txt

return str(data).lower()
return str(translated)
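For context, a rough usage sketch of the guarded round-trip translation; the `src`/`to` constructor arguments are inferred from the attributes used above, and when neither TextBlob translation nor googletrans is available the method now returns the input text unchanged:

```python
# Sketch: paraphrase via round-trip translation, with graceful fallback.
from textaugment.translate import Translate

t = Translate(src="en", to="fr")         # assumed constructor arguments
print(t.augment("The quick brown fox"))  # en -> fr -> en paraphrase, or the original text
```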
2 changes: 1 addition & 1 deletion textaugment/word2vec.py
@@ -88,7 +88,7 @@ def __init__(self, **kwargs):
            self.model = gensim.models.Word2Vec.load(self.model)  # load word2vec or fasttext model
        except FileNotFoundError:
            print("Error: Model not found. Verify the path.\n")
            raise ValueError("Error: Model not found. Verify the path.")
            raise FileNotFoundError("Error: Model not found. Verify the path.")

    def geometric(self, data):
        """
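Re-raising `FileNotFoundError` instead of converting it to `ValueError` lets callers distinguish a missing model file from a bad parameter. A small sketch of the calling side under that assumption:

```python
# Sketch: a missing model path now surfaces as FileNotFoundError.
from textaugment.word2vec import Word2vec

try:
    augmenter = Word2vec(model="/tmp/non_existent_model")
except FileNotFoundError:
    augmenter = None  # e.g. fall back to another augmenter or prompt for a valid path
```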