-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextPreprocess.py
executable file
·153 lines (123 loc) · 5.06 KB
/
TextPreprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import numpy as np
import pandas as pd
import logging
import itertools
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator
class TextPreprocess(BaseEstimator):
    """Apply an ordered list of named preprocessing steps to a comments frame.

    Every name in ``preprocs`` maps to a fit handler (learns state on ``self``
    and returns nothing) and a transform handler (returns ``(data, info)``
    where ``info`` is a dict of diagnostics).  Steps run sequentially in the
    order given.

    Supported steps:
        raw               pass the data through unchanged.
        fillna            replace missing ``comment_text`` with "no comment".
        comment_to_lower  lower-case ``comment_text``.
        text_to_seq       tokenize ``comment_text`` with a keras ``Tokenizer``
                          and pad every sequence to ``max_len``.  Its
                          transform returns the ``pad_sequences`` result
                          rather than the DataFrame, so steps that need the
                          frame cannot follow it.

    :param preprocs: list of step names; ``None`` means ``['raw']``.
    :param max_features: passed to ``Tokenizer`` as ``num_words``.
    :param max_len: passed to ``pad_sequences`` as ``maxlen``.
    """

    # Step names accepted by the ``preprocs`` setter.
    _SUPPORTED_PREPROCS = ('raw', 'fillna', 'comment_to_lower', 'text_to_seq')

    def __init__(self, preprocs=None, max_features=100000, max_len=150):
        # ``None`` stands in for the former ``['raw']`` default so that a
        # single list object is not shared between instances.
        self.preprocs = ['raw'] if preprocs is None else preprocs  # invokes the setter
        self.max_features = max_features
        self.max_len = max_len
        self.preprocs_lib_fit = {  # fit version of the approaches
            'raw': self.raw_fit,
            'fillna': self._fillna_fit,
            'comment_to_lower': self._comment_to_lower_fit,
            'text_to_seq': self._text_to_seq_fit,
        }
        self.preprocs_lib_transform = {  # transform version of the approaches
            'raw': self.raw_transform,
            'fillna': self._fillna_transform,
            'comment_to_lower': self._comment_to_lower_transform,
            'text_to_seq': self._text_to_seq_transform,
        }
        # Info dict of every transform step executed so far, kept for later
        # retrieval and investigation (see ``print_info``).
        self.preprocs_info = []
        super().__init__()

    def get_pickable(self):
        """Return the constructor parameters as a plain dict."""
        return {
            'preprocs': self.preprocs,
            'max_features': self.max_features,
            'max_len': self.max_len,
        }

    def load_pickable(self, pkl):
        """Restore the constructor parameters from a ``get_pickable`` dict."""
        self.preprocs = pkl['preprocs']
        self.max_features = pkl['max_features']
        self.max_len = pkl['max_len']

    @property
    def preprocs(self):
        """Ordered list of preprocessing step names."""
        return self.__preprocs

    @preprocs.setter
    def preprocs(self, value):
        # Problems are logged, not raised: the value is stored regardless, and
        # an unsupported name surfaces later as a KeyError when fit/transform
        # looks it up in the dispatch tables.
        if not isinstance(value, list):
            logging.error('Preprocessing methods should be passed as a list')
        for v in value:
            if v not in self._SUPPORTED_PREPROCS:
                logging.error(f'Preprocessing method {v} is not supported')
        self.__preprocs = value

    def raw_fit(self, df):
        """Nothing to learn for the pass-through step."""
        pass

    def raw_transform(self, df):
        """Return ``df`` unchanged together with an empty info dict."""
        return df, {}

    def _fillna_fit(self, df):
        pass  # do nothing

    def _fillna_transform(self, df):
        """Replace missing ``comment_text`` values with "no comment".

        Modifies ``df`` in place and returns it with the number of rows that
        were imputed.
        """
        comments = df["comment_text"]
        # Count BEFORE filling; counting afterwards would always report 0.
        n_missing = int(comments.isnull().sum())
        df["comment_text"] = comments.fillna("no comment")
        return df, {'#rows with nan imputed': f'{n_missing} out of {len(df)}'}

    def _comment_to_lower_fit(self, df):
        pass

    def _comment_to_lower_transform(self, df):
        """Lower-case ``comment_text`` in place and return ``df``."""
        df["comment_text"] = df["comment_text"].str.lower()
        return df, {}

    def _text_to_seq_fit(self, train):
        """
        Standard keras preprocessing: fit a tokenizer on ``comment_text`` so
        that each comment can later be turned into a list of word indexes.
        :param train: frame with a ``comment_text`` column
        :return: nothing; the fitted state is stored on ``self``
        """
        self.tk = Tokenizer(num_words=self.max_features,
                            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\'\n',  # I added \' it was not there by default
                            lower=True,
                            split=" ",
                            char_level=False)
        self.tk.fit_on_texts(train["comment_text"])
        # word_index - A dictionary of words and their uniquely assigned integers
        self.word_index = self.tk.word_index
        # Important: values required by transform after fitting must be kept
        # as instance attributes (self), not returned. A _fit function returns
        # nothing; a _transform function returns (data, info dict).

    def _text_to_seq_transform(self, df):
        """
        Standard keras preprocessing: convert ``comment_text`` to lists of
        word indexes and bring them to equal length with ``pad_sequences``.
        :param df: frame with a ``comment_text`` column; gains ``comment_seq``
        :return: the ``pad_sequences`` result and an empty info dict
        :raises AttributeError: if the tokenizer has not been fitted yet
        """
        if not hasattr(self, 'tk'):
            # Same exception type an unfitted instance raised before, but
            # with a message that says what to do.
            raise AttributeError(
                "text_to_seq has not been fitted; call fit() or "
                "fit_transform() before transform()")
        df["comment_seq"] = self.tk.texts_to_sequences(df["comment_text"])
        x_df = pad_sequences(df["comment_seq"], maxlen=self.max_len)
        return x_df, {}

    def fit(self, x, y=None, **fit_params):
        """Fit every step in order, each on the output of the previous steps.

        This matches what ``fit_transform`` does, so ``fit(x)`` followed by
        ``transform(x)`` learns the same state as ``fit_transform(x)``; e.g.
        ``text_to_seq`` is fitted after ``fillna`` has removed missing values.
        The intermediate transforms run on a copy, leaving ``x`` untouched,
        and their info dicts are not recorded.

        :return: ``self``
        """
        import copy  # local: only this method needs it

        steps = self.preprocs
        final = len(steps) - 1
        # Copy only when an intermediate transform will actually run; those
        # transforms assign columns in place and must not alter the caller's
        # data before transform() sees it.
        data = copy.copy(x) if final > 0 else x
        for position, name in enumerate(steps):
            self.preprocs_lib_fit[name](data)
            if position < final:
                data, _ = self.preprocs_lib_transform[name](data)
        return self

    def transform(self, x):
        """Run every step's transform in order and return the final result.

        Each step's info dict is appended to ``preprocs_info``.
        """
        for f in self.preprocs:  # filters are applied sequentially as per order given in self.preprocs
            x, info = self.preprocs_lib_transform[f](x)
            self.preprocs_info.append(info)
        return x

    def fit_transform(self, x, y=None, **fit_params):
        """Fit and apply each step in order and return the final result.

        Every step is fitted on the output of the steps before it; each
        step's info dict is appended to ``preprocs_info``.
        """
        for f in self.preprocs:
            self.preprocs_lib_fit[f](x)
            x, info = self.preprocs_lib_transform[f](x)
            self.preprocs_info.append(info)
        return x

    def print_info(self):
        """ print all the info about applied preprocs """
        print("Preprocessing info")
        for info in self.preprocs_info:
            for k, v in info.items():
                print(k, ":", v)
"""
Example usage:

textPreprocess = TextPreprocess(
    preprocs = ['raw', 'fillna', 'comment_to_lower', 'text_to_seq']
    ,max_features = 200000
    ,max_len = 150
)
textPreprocess.fit(train_df)
train_pre = textPreprocess.transform(train_df)
test_pre = textPreprocess.transform(test_df)
textPreprocess.print_info()
textPreprocess.get_pickable()
"""