-
Notifications
You must be signed in to change notification settings - Fork 0
/
Preprocessing.py
67 lines (54 loc) · 2.04 KB
/
Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import string
from collections import OrderedDict
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
class Preprocessing:
    """Text preprocessing steps exposed as static methods.

    The full pipeline, in the order applied by :meth:`preprocess`, is
    cleaning -> case folding -> tokenisation -> stop-word filtering ->
    stemming (with the Sastrawi stemmer).
    """

    # Characters treated as punctuation by ``cleaning``.
    punctuation_list = string.punctuation

    # Matches a whitespace-delimited token that contains ``.<tld>`` for one of
    # the listed suffixes, plus at most one trailing whitespace character.
    # NOTE(review): ``nel`` looks like a typo (perhaps ``nl`` or ``net``);
    # left unchanged because it is runtime behaviour -- confirm with the author.
    match_domain = re.compile(
        r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?')

    # Stop words, one per line of "stopword-list.txt" (path is relative to the
    # current working directory). Loaded eagerly at class-definition time; the
    # ``with`` block guarantees the file handle is closed.
    with open("stopword-list.txt", "r") as _stopword_file:
        stopwords = _stopword_file.read().split("\n")
    del _stopword_file  # do not leave the closed handle behind as an attribute

    # Lazily created stemmer shared by all ``stemming`` calls.
    _stemmer = None

    @staticmethod
    def cleaning(source):
        """Remove domain-like tokens, punctuation and numeric characters.

        The input is stripped, every ``match_domain`` match is deleted, and
        then each punctuation or numeric character is dropped. A single space
        is emitted in place of a dropped character only when the character
        right after it is an ordinary one (not punctuation, not numeric and
        not a space), so words glued together by punctuation or digits do not
        merge (``"a,b"`` -> ``"a b"``).

        Args:
            source: The raw text (``str``).

        Returns:
            The cleaned text as a ``str``.
        """
        text = Preprocessing.match_domain.sub('', source.strip())
        punctuation = Preprocessing.punctuation_list
        last_index = len(text) - 1

        # Collect pieces and join once instead of repeated ``+=`` on a string.
        pieces = []
        for index, char in enumerate(text):
            if char in punctuation or char.isnumeric():
                # A dropped character at the very end needs no separator.
                if index != last_index:
                    next_char = text[index + 1]
                    if (next_char not in punctuation and
                            not next_char.isnumeric() and next_char != ' '):
                        pieces.append(' ')
            else:
                pieces.append(char)
        return ''.join(pieces)

    @staticmethod
    def case_folding(source):
        """Return ``source`` converted to lower case."""
        return source.lower()

    @staticmethod
    def tokenisasi(source):
        """Split ``source`` on runs of whitespace and return the token list."""
        return source.split()

    @staticmethod
    def filtering(source):
        """Return the tokens of ``source`` that are not stop words.

        Args:
            source: An iterable of tokens (hashable, normally ``str``).

        Returns:
            A list with the remaining tokens in their original order.
        """
        # Build the set from the public list on each call so that changes a
        # caller made to ``Preprocessing.stopwords`` are still honoured, while
        # each membership test is a hash lookup instead of a list scan.
        stopword_set = set(Preprocessing.stopwords)
        return [word for word in source if word not in stopword_set]

    @staticmethod
    def stemming(source):
        """Stem every non-empty, ASCII-only token of ``source``.

        Empty tokens and tokens containing non-ASCII characters are dropped
        from the result rather than stemmed.

        Args:
            source: An iterable of ``str`` tokens.

        Returns:
            A list of stemmed tokens.
        """
        # Create the stemmer once and reuse it; building a factory and a
        # stemmer on every call is loop-invariant work.
        stemmer = Preprocessing._stemmer
        if stemmer is None:
            stemmer = StemmerFactory().create_stemmer()
            Preprocessing._stemmer = stemmer
        return [stemmer.stem(word) for word in source if word and word.isascii()]

    @staticmethod
    def type(source):
        """Return the distinct tokens of ``source`` in first-occurrence order.

        The method name shadows the ``type`` builtin inside the class
        namespace; it is kept for backward compatibility with callers.
        """
        # ``dict`` preserves insertion order, so this de-duplicates stably.
        return list(dict.fromkeys(source))

    @staticmethod
    def preprocess(source):
        """Run the whole pipeline on raw text and return the stemmed tokens.

        Args:
            source: The raw text (``str``).

        Returns:
            A list of stemmed, stop-word-free, lower-cased tokens.
        """
        cleaned = Preprocessing.cleaning(source)
        lowered = Preprocessing.case_folding(cleaned)
        tokens = Preprocessing.tokenisasi(lowered)
        kept = Preprocessing.filtering(tokens)
        return Preprocessing.stemming(kept)