diff --git a/textblob/blob.py b/textblob/blob.py
index f8182f20..b74058f3 100644
--- a/textblob/blob.py
+++ b/textblob/blob.py
@@ -23,6 +23,8 @@
 from __future__ import unicode_literals, absolute_import
 import sys
 import json
+import io
+import re
 from collections import defaultdict
 
 import nltk
@@ -368,6 +370,37 @@ def __init__(self, text, tokenizer=None,
         _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                            parser, classifier)
 
+    @cached_property
+    def remove_stopwords(self, fname="stopwords.txt"):
+        """Return the raw text with stop words removed.
+
+        ``self.raw`` is split on whitespace and every token that exactly
+        matches a stop word is dropped; the remaining tokens are joined with
+        single spaces. Matching is case-sensitive, and a token with
+        punctuation attached (e.g. ``"the,"``) is not treated as a match.
+
+        NOTE(review): assuming ``cached_property`` behaves like ``property``,
+        this is read as an attribute, so ``fname`` always keeps its default.
+
+        :param fname: Path to a UTF-8 text file of stop words. Every run of
+            word characters in the file is one stop word. A relative path is
+            resolved against the current working directory.
+        :raises IOError: If ``fname`` cannot be opened.
+        :rtype: str
+        """
+        # io.open (unlike Python 2's builtin open) takes an encoding, so the
+        # stop words are decoded to text the same way on Python 2 and 3.
+        with io.open(fname, encoding="utf-8") as f:
+            # A set makes each membership test below O(1) instead of O(n).
+            stopwords = set(
+                word
+                for line in f
+                for word in re.findall(r'\w+', line, flags=re.UNICODE)
+            )
+        return ' '.join(
+            word for word in self.raw.split() if word not in stopwords
+        )
+
     @cached_property
     def words(self):
         """Return a list of word tokens. This excludes punctuation characters.