-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocess.py
95 lines (70 loc) · 3.85 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import re
import codecs
import pandas as pd
import requests, zipfile, io
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
def download_dataset(url):
    """Download and extract The Blog Authorship Corpus zip archive.

    Args:
        url: Direct HTTP(S) link to the corpus ``blogs.zip`` file.

    Side effects:
        Extracts the archive into ``<cwd>/Dataset``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    print('Downloading The Blog Authorship Corpus...')
    r = requests.get(url)
    # Fail loudly on HTTP errors instead of handing an error page to ZipFile.
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(os.path.join(os.getcwd(), 'Dataset'))
    print('Dataset downloaded & saved in {}/Dataset'.format(os.getcwd()))
# Filters the text inside the post tag in xml files and extracts the labels out of xml file name for Short text based classification.
# Processes the post text, removes stopwords, returns a pandas DataFrame containing label, text, gender, age, zodiac.
def process_data_short_text(folder_path):
    """Build a per-post ("short text") dataset from the blog corpus.

    Every ``<post>`` element in each XML file becomes one row. Labels are
    parsed from the file name, which follows the pattern:
        ``<id>.<gender>.<age>.<label>.<zodiac>.xml``

    Args:
        folder_path: Directory containing the corpus ``.xml`` files.

    Side effects:
        Writes ``blogdata_short_text.csv`` into the current working directory.
    """
    print('Processing data for short text...')
    # Build the stopword set once; rebuilding it for every post is wasteful.
    stop_words = set(stopwords.words('english'))
    rows = []
    for filename in os.listdir(folder_path):
        # os.path.splitext removes exactly the '.xml' extension; the original
        # rstrip('.xml') strips any trailing run of the characters '.', 'x',
        # 'm', 'l' and can eat legitimate label characters.
        base = os.path.splitext(filename)[0]
        # Drop the leading numeric blogger id, then split into label fields.
        labels = base.lstrip('0123456789.').lower().split('.')
        # Beautiful Soup (lxml backend) parses the XML; errors='ignore'
        # tolerates the corpus's occasional bad byte sequences.
        blog = BeautifulSoup(codecs.open(os.path.join(folder_path, filename),
                                         encoding='utf-8', errors='ignore'), "lxml")
        for post in blog.find_all('post'):
            # Keep only alphabetic words, lowercase them, drop stopwords.
            words = re.sub('[^A-Za-z]+', ' ', post.text).strip().lower().split()
            post_text = ' '.join(word for word in words if word not in stop_words)
            rows.append({'label': labels[2], 'text': post_text,
                         'gender': labels[0], 'age': labels[1],
                         'zodiac': labels[3]})
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway;
    # build the frame once from a list of row dicts instead.
    df = pd.DataFrame(rows, columns=['label', 'text', 'gender', 'age', 'zodiac'])
    # Write DataFrame to csv
    df.to_csv('blogdata_short_text.csv')
    return print('Data processed & saved as {}/blogdata_short_text.csv'.format(os.getcwd()))
# Filters the text inside the post tag in xml files and extracts the labels out of xml file name for long text based classification.
# Processes the post text, removes stopwords, returns a pandas DataFrame containing label, text, gender, age, zodiac.
def process_data_long_text(folder_path):
    """Build a per-blogger ("long text") dataset from the blog corpus.

    All ``<post>`` elements of one XML file are concatenated into a single
    row. Labels are parsed from the file name, which follows the pattern:
        ``<id>.<gender>.<age>.<label>.<zodiac>.xml``

    Args:
        folder_path: Directory containing the corpus ``.xml`` files.

    Side effects:
        Writes ``blogdata_long_text.csv`` into the current working directory.
    """
    print('Processing data for long text...')
    # Build the stopword set once; rebuilding it per file is wasteful.
    stop_words = set(stopwords.words('english'))
    rows = []
    for f in os.listdir(folder_path):
        # Split the name once instead of four separate split() calls.
        parts = f.split('.')
        ds_gender = parts[1].lower()
        ds_age = parts[2]
        ds_label = parts[3].lower()
        ds_zodiac = parts[4].lower()
        blog_file = BeautifulSoup(codecs.open(os.path.join(folder_path, f),
                                              encoding='utf-8', errors='ignore'), "lxml")
        # ' '.join avoids the quadratic cost of repeated '+' concatenation
        # and keeps adjacent posts from fusing their boundary words together.
        combined = ' '.join(post.text for post in blog_file.find_all('post'))
        # Keep only alphabetic words, lowercase them, drop stopwords.
        words = re.sub('[^A-Za-z]+', ' ', combined).strip().lower().split()
        post_text = ' '.join(word for word in words if word not in stop_words)
        rows.append({'label': ds_label, 'text': post_text,
                     'gender': ds_gender, 'age': ds_age,
                     'zodiac': ds_zodiac})
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway;
    # build the frame once from a list of row dicts instead.
    df = pd.DataFrame(rows, columns=['label', 'text', 'gender', 'age', 'zodiac'])
    # Save DataFrame
    df.to_csv('blogdata_long_text.csv')
    return print('Data processed & saved as {}/blogdata_long_text.csv'.format(os.getcwd()))
if __name__ == "__main__":
# The Blog Authorship Corpus data set url
url = 'http://www.cs.biu.ac.il/~koppel/blogs/blogs.zip'
# Downloads the data set and saves it locally
download_dataset(url)
# Folder containing The Blog Authorship Corpus
folder_path = os.getcwd() + '/Dataset/blogs'
# process_data_short_text(folder_path)
process_data_long_text(folder_path)