processing_original_question.py
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

# Assumes the NLTK stopwords corpus is already available; run nltk.download('stopwords') once if it is not.
data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv')
stop_words = set(stopwords.words('english'))
def remove_stopwords(words):
    return [w for w in words if w not in stop_words]  # Removing stopwords

def remove_duplicates(my_list):
    return list(set(my_list))  # Removing duplicates (set() does not preserve token order)
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem)))  # get rid of URLs
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[0-9]', '', str(elem)))  # drop digits
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]', ' ', str(elem)))  # replace special characters with spaces
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'\(|\)|\[|\]', ' ', str(elem)))  # replace brackets with spaces
    df[text_field] = df[text_field].apply(lambda elem: re.sub('a0', '', str(elem)))  # strip stray 'a0' artifacts (e.g. leftovers of non-breaking spaces)
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'\.', '. ', str(elem)))  # ensure a space after '.'
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'\!', '! ', str(elem)))  # ensure a space after '!'
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r'\?', '? ', str(elem)))  # ensure a space after '?'
    df[text_field] = df[text_field].apply(lambda elem: re.sub(' +', ' ', str(elem)))  # collapse repeated spaces
    return df
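
# --- Illustration (not part of the original pipeline): the effect of standardize_text on a tiny
# --- throwaway DataFrame. Kept commented out so the module behaves exactly as before.
# _demo = pd.DataFrame({"Question": ["Visit http://example.com for Q4 [2021] results!"]})
# print(standardize_text(_demo, "Question")["Question"].iloc[0])
# -> "visit for q results! "  (lowercased; URL, digits and brackets stripped; spacing normalized)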

def from_questions_csv():
    data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv')
    clean_questions = standardize_text(data, "Question")
    # print(clean_questions.head())
    tokenizer = RegexpTokenizer(r'\w+')
    clean_questions["tokens"] = clean_questions["Question"].apply(tokenizer.tokenize)  # Tokenization
    clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_total.csv')
    clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates)  # Removing duplicates
    clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_duplicates.csv')
    clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords)  # Removing stopwords
    clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_stopwords.csv')
    print(clean_questions.head())

def from_keywords_filters_csv():
    clean_questions = standardize_text(data, "Entity")
    clean_questions = standardize_text(data, "Filters")
    # print(clean_questions.head())
    tokenizer = RegexpTokenizer(r'\w+')
    clean_questions["tokens"] = clean_questions["Filters"].apply(tokenizer.tokenize)  # Tokenization
    clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates)  # Removing duplicates
    clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords)  # Removing stopwords
    clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_filters.csv')
    print(clean_questions.head())
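
# --- Usage sketch (assumption, not in the original file): neither function is invoked above,
# --- so a caller would typically run them like this.
if __name__ == '__main__':
    from_questions_csv()         # clean/tokenize the "Question" column and write the staged CSVs
    from_keywords_filters_csv()  # clean/tokenize the "Entity"/"Filters" columns and write words_filters.csv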