# importing relevant libraries
import pickle
import re
import string

import numpy as np
import pandas as pd

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
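
# NOTE: these helpers assume the following NLTK data packages have been
# downloaded; uncomment and run once per environment if they are missing.
# nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
# nltk.download('stopwords')  # English stop word list
# nltk.download('wordnet')    # dictionary used by WordNetLemmatizer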
## functions used in data_cleaning.ipynb
# tweet cleaning function
def clean_text_round1(text):
    '''Make text lowercase; remove punctuation, mentions, hashtags,
    bracketed text, and words containing numbers.'''
    # make text lowercase
    text = text.lower()
    # remove text within square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # replace newlines with spaces, then collapse runs of whitespace
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # remove double quotes
    text = re.sub(r'\"+', '', text)
    # remove HTML-escaped ampersands (&amp;)
    text = re.sub(r'&amp;', '', text)
    # remove usernames (@mentions)
    text = re.sub(r'@[^\s]+', '', text)
    # remove hashtags
    text = re.sub(r'#[^\s]+', '', text)
    # remove the standalone token `rt` (retweet marker); the word
    # boundaries keep it from eating `rt` inside words like `start`
    text = re.sub(r'\brt\b', '', text)
    # string.punctuation is a string of all punctuation marks,
    # so this strips all remaining punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove the `httptco` residue left over after punctuation is
    # stripped from shortened URLs (http://t.co/...)
    text = re.sub(r'httptco', '', text)
    return text
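
# a hedged usage sketch on a made-up tweet (not from the project data);
# leftover double spaces can remain because whitespace is collapsed
# before the mention/hashtag/punctuation removals:
#   clean_text_round1('RT @user: Check this out! #cool http://t.co/xyz')
#   -> '  check this out  xyz'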
## functions used in nlp_preprocessing.ipynb
def unfiltered_tokens(text):
    """Tokenize text without removing stop words."""
    dirty_tokens = nltk.word_tokenize(text)
    return dirty_tokens
# tokenizing and removing stop words
def process_tweet(text):
    """Tokenize text and remove English stop words."""
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
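
# a hedged usage sketch (assumes the punkt and stopwords data are installed):
#   process_tweet('Dogs are the best')
#   -> ['dogs', 'best']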
#### list to string
def listToString(s):
    """Join a list of tokens into a single space-separated string."""
    # join on a single space; avoid naming the separator `string`,
    # which would shadow the imported `string` module
    return ' '.join(s)
##### function for lemmatization
def lemmatization(processdata):
    """Lemmatize each list of tokens and rejoin it into one string."""
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = []
    for listy in processdata:
        lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
        lemmatized_output.append(lemmed)
    return lemmatized_output
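
# a hedged usage sketch: the input is a list of token lists, as produced
# by process_tweet, and each inner list comes back as one lemmatized string
# (assumes the wordnet data is installed):
#   lemmatization([['dogs', 'cats', 'geese']])
#   -> ['dog cat goose']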
## functions used for modeling process
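# no modeling helpers are defined in this file yet; as a hedged,
# illustrative sketch (not the project's actual modeling code), the
# helpers above could feed a TF-IDF vectorizer roughly like this:
if __name__ == '__main__':
    sample_tweets = ['RT @user: Dogs are the best! #dogs',
                     'Cats are napping http://t.co/abc']
    cleaned = [clean_text_round1(t) for t in sample_tweets]
    tokenized = [process_tweet(t) for t in cleaned]
    lemmed = lemmatization(tokenized)
    vectorizer = TfidfVectorizer()   # imported above
    features = vectorizer.fit_transform(lemmed)
    print(features.shape)            # (number of tweets, vocabulary size)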