This repository has been archived by the owner on Oct 2, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_remove_stopwords.py
58 lines (49 loc) · 3.48 KB
/
string_remove_stopwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# from the Wikipedia:
# In computing, stop words are words which are filtered out before or after processing
# of natural language data (text). Though "stop words" usually refers to the most common words in a language,
# there is no single universal list of stop words used by all natural language processing tools,
# and indeed not all tools even use such a list.
# Some tools specifically avoid removing these stop words to support phrase search.
import nltk
from nltk.corpus import stopwords


def main() -> None:
    """Read one line from stdin, tokenize it, and print the tokens with
    German stopwords removed.

    Requires the NLTK 'stopwords' corpus and 'punkt' tokenizer models
    (install once with nltk.download('stopwords') / nltk.download('punkt')).
    """
    # NLTK's German stopword list: ~230 common function words such as
    # 'aber', 'alle', 'und', 'mit', 'nicht', ... (see NLTK corpus docs).
    german_stop_words = stopwords.words('german')

    # Use a set so the membership test in the filter below is O(1).
    german_stop_set = set(german_stop_words)

    # Some stopwords carry meaning for this application and must be kept
    # (e.g. 'mit'/'ohne'/'kein'/'nicht' distinguish with/without).
    # BUG FIX: the original removed these via a list comprehension, which
    # silently turned the set back into a list and defeated the O(1)
    # lookup. Set difference keeps it a set.
    needed_words = {'mit', 'ohne', 'kein', 'extra', 'nicht'}
    german_stop_set -= needed_words

    string_input = input("your input please: ")

    # Tokenize the raw input into words/punctuation tokens.
    string_tokenised = nltk.word_tokenize(string_input)

    # Drop every token that is a (non-whitelisted) stopword.
    # (Filtering into a new list avoids the classic bug of calling
    # list.remove() while iterating the same list, which skips elements.)
    # NOTE(review): matching is case-sensitive — a sentence-initial "Der"
    # would NOT be removed; confirm whether input should be lowercased.
    string_tokenised = [word for word in string_tokenised
                        if word not in german_stop_set]
    print(string_tokenised)


if __name__ == "__main__":
    main()