-
Notifications
You must be signed in to change notification settings - Fork 5
/
gender_stats.py
125 lines (99 loc) · 4.68 KB
/
gender_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
__author__ = 'Megan Ruthven'
import re
from nltk.corpus import stopwords
import pandas as pd
import json
import numpy as np
# Start-up banner.  The single-argument call form prints the same text on
# Python 2 and Python 3 (the bare ``print "..."`` statement is Python 2 only).
print("Lets look at those datas")

# Path of the posts CSV read further down; 'your_path' is a placeholder that
# has to be replaced before the script is run.
dataPath = 'your_path'

# English stop-word list from NLTK.  Nothing else in this script reads it.
stop = stopwords.words('english')

# config.json supplies the word lists under the keys 'male' and 'female'.
with open('config.json') as in_json:
    data = json.load(in_json)

# Lower-cased, whitespace-stripped word lists, plus set versions of each.
male = [word.lower().strip() for word in data['male']]
mset = set(male)
female = [word.lower().strip() for word in data['female']]
fset = set(female)

# Column names, in order, of every statistics table produced by this script.
colNames = (
    'name',
    'male_count', 'male_median',
    'female_count', 'female_median',
    'both_count', 'both_median',
    'none_count', 'none_median',
    'total_posts', 'total_median',
    'gendered_posts', 'gendered_median',
)
def agg_groups(group, name):
    """Build one statistics row for a group of rows.

    Args:
        group: table indexable by column name that provides the columns
            'male', 'female', 'both', 'none', 'type' and 'like_count'.
        name: label stored under the 'name' key of the result.

    Returns:
        A dict with one entry per name in ``colNames``: the sums of the four
        flag columns, the number of non-null 'type' values, the number of
        gendered rows (male + female - both) and the median 'like_count' of
        each subset and of the whole group.
    """
    def median_likes(selector):
        # Median like_count of the rows picked out by a boolean selector.
        return group[selector]['like_count'].median()

    male_total = group['male'].sum()
    female_total = group['female'].sum()
    both_total = group['both'].sum()

    # Start from zeros so every column in colNames is present in the row.
    summary = dict.fromkeys(colNames, 0)
    summary.update({
        'name': name,
        'male_count': male_total,
        'female_count': female_total,
        'both_count': both_total,
        'none_count': group['none'].sum(),
        'total_posts': group['type'].count(),
        # Rows flagged both male and female are counted once, not twice.
        'gendered_posts': male_total + female_total - both_total,
        'male_median': median_likes(group['male'] == 1),
        'female_median': median_likes(group['female'] == 1),
        'none_median': median_likes(group['none'] == 1),
        'both_median': median_likes(group['both'] == 1),
        'gendered_median': median_likes(group['none'] == 0),
        'total_median': group['like_count'].median(),
    })
    return summary
# Prefix of every CSV file name written by this script.
runName = 'posts_'

# Native text type: ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u'')
# Patterns compiled once instead of once per message.
_APOSTROPHES = re.compile('\'+')
_NON_ALNUM_OR_SPACE = re.compile('[^0-9a-zA-Z\x20]+')


def _normalize_message(message):
    """Return a lower-cased version of *message* made of a-z, 0-9 and spaces.

    Characters that cannot be encoded as Latin-1 are dropped, runs of
    apostrophes are removed, every remaining run of characters other than
    ``0-9a-zA-Z`` and space becomes a single space, and the result is
    lower-cased and stripped.  Any value that is not text (for example a
    missing message) yields an empty string.
    """
    if not isinstance(message, _TEXT_TYPE):
        return ''
    latin1 = message.encode('latin-1', 'ignore')
    if not isinstance(latin1, str):
        # Python 3: encode() returned bytes, so decode back to text for the
        # str patterns below.  On Python 2 the encoded value already is str.
        latin1 = latin1.decode('latin-1')
    without_apostrophes = _APOSTROPHES.sub('', latin1)
    return _NON_ALNUM_OR_SPACE.sub(' ', without_apostrophes).lower().strip()


reader = pd.read_csv(dataPath, encoding='utf-8')
reader = reader[reader.group_name != 'Ladies Storm Hackathons'].reset_index()
#reader = reader[reader.group_name == 'Hackathon Hackers'].reset_index()

# Rows of type 'like'; not referenced again in this script.
likes = reader[reader.type == 'like']

# Everything that is not a like and whose parent is a group.  copy() gives an
# independent frame, so the column assignments below do not write into a
# slice of `reader`.
comments = reader[reader.type != 'like']
comments = comments[comments['parent_type'] == 'group'].copy()

comments['norm_message'] = [_normalize_message(message) for message in comments.message]
comments['date'] = pd.to_datetime(comments['created_time'])
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
comments = comments.sort_values('date')

# only using posts from when HH became somewhat active
beginning = pd.to_datetime("2014-07-01")
comments = comments[comments['date'].notnull()]
comments = comments[comments['date'] >= beginning].copy()

# Index by timestamp: the dates are labelled as UTC, then converted to
# US/Central, so index-based date parts are Central wall-clock values.
comments.index = comments['date']
comments.index = comments.index.tz_localize('UTC').tz_convert('US/Central')

# determining gender membership
# Each message is split once; a flag is 1 when the message shares at least
# one word with the corresponding word set.
tokens = [message.split() for message in comments['norm_message']]
male_flags = [0 if mset.isdisjoint(words) else 1 for words in tokens]
female_flags = [0 if fset.isdisjoint(words) else 1 for words in tokens]
comments['male'] = male_flags
comments['female'] = female_flags
# Derived from the flags directly instead of a row-by-row iterrows() pass.
comments['both'] = [m & f for m, f in zip(male_flags, female_flags)]
comments['none'] = [1 - (m | f) for m, f in zip(male_flags, female_flags)]
# start of analysis on different categories in the data
def _write_group_stats(frame, keys, filename, keep=None, label=None):
    """Group *frame* by *keys*, aggregate each group and write one CSV.

    Args:
        frame: table of posts to group.
        keys: anything accepted by ``DataFrame.groupby`` (a column name, an
            array of key values, or a list of such arrays).
        filename: suffix appended to ``runName`` to form the output path.
        keep: optional predicate on the group name; groups for which it
            returns a false value are skipped.
        label: optional function turning the group name into the value that
            is printed and stored in the 'name' column.
    """
    rows = []
    for name, group in frame.groupby(keys):
        if keep is not None and not keep(name):
            continue
        if label is not None:
            name = label(name)
        print(name)
        rows.append(agg_groups(group, name))
    # Build the table once from all rows: DataFrame.append() copied the whole
    # frame on every call and has been removed from current pandas.
    pd.DataFrame(rows, columns=list(colNames)).to_csv(runName + filename, sep=',')


def _from_2014(minimum):
    """Return a filter for (year, part) keys: 2015 onwards, or 2014 with part >= minimum."""
    def keep(key):
        year, part = key
        return year >= 2015 or (year == 2014 and part >= minimum)
    return keep


def _latin1_label(text):
    """Drop characters outside Latin-1 and return the result as a native str."""
    encoded = text.encode('latin-1', 'ignore')
    # Python 2 already yields str here; Python 3 yields bytes to decode.
    return encoded if isinstance(encoded, str) else encoded.decode('latin-1')


stamps = comments.index

_write_group_stats(comments, [stamps.year, stamps.month], 'month_year.csv',
                   keep=_from_2014(7))

_write_group_stats(comments, [stamps.dayofweek, stamps.hour], 'dow_hour.csv')

# Weekly statistics are keyed by ISO year and ISO week taken together.
# Pairing the ISO week with the calendar year mislabels the days around New
# Year: 29-31 Dec 2014 belong to ISO week 1 of 2015, so they would show up as
# (2014, 1) and then be discarded by the "week >= 30" filter.
iso_parts = [stamp.isocalendar() for stamp in stamps]
iso_year = np.array([parts[0] for parts in iso_parts], dtype=int)
iso_week = np.array([parts[1] for parts in iso_parts], dtype=int)
_write_group_stats(comments, [iso_year, iso_week], 'week_year.csv',
                   keep=_from_2014(30))

# Single keys are passed directly rather than inside a one-element list so
# that the group names stay plain scalars instead of 1-tuples.
_write_group_stats(comments, stamps.dayofweek, 'dow.csv')

_write_group_stats(comments, stamps.hour, 'hour.csv')

_write_group_stats(comments, 'group_name', 'group.csv', label=_latin1_label)

print("fin")