Skip to content

Commit 688e033

Browse files
victorhauvibaronet2
and
baronet2
authored
EDA figures (#25)
* eda * plots * womens data * womens data * eda * eda * eda figures * eda figures * eda figures * eda figures * eda figures * womens data * removed womens data * removed womens data * Revert eval notebook --------- Co-authored-by: baronet2 <[email protected]>
1 parent 9cbe4ee commit 688e033

5 files changed

+534
-0
lines changed

eda.ipynb

+457
Large diffs are not rendered by default.

eda.py

+77
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
import plotnine
2+
from plotnine import *
3+
4+
def climbers_above_replacement_level(df, color_fill, filename,
                                     replacement_levels=None):
    """Plot how many climbers exceed each replacement level and save it.

    A climber is counted for a level when the number of rows in ``df``
    carrying their ``Name`` is strictly greater than that level.

    Args:
        df: DataFrame with a ``Name`` column; rows are counted per name.
        color_fill: Fill colour used for the bars.
        filename: Stem of the PNG written under ``results/eda/``.
        replacement_levels: Iterable of thresholds to evaluate. When
            omitted, the module-level ``REPLACEMENT_LEVELS`` is used, so
            existing three-argument callers behave as before.

    Returns:
        The plotnine ``ggplot`` object that was saved.
    """
    # Local import: at module level pandas is only imported under the
    # ``__main__`` guard, so a global ``pd`` is missing when this function
    # is called from an importing module.
    import pandas as pd

    if replacement_levels is None:
        replacement_levels = REPLACEMENT_LEVELS
    replacement_levels = list(replacement_levels)

    attempt_counts = df['Name'].value_counts()
    # Vectorised count of climbers strictly above each threshold.
    climbers_above = [
        int((attempt_counts > level).sum()) for level in replacement_levels
    ]
    plot_df = pd.DataFrame({
        'Replacement Level': replacement_levels,
        'Climbers Above': climbers_above,
    })
    # Categorical x so the bars are evenly spaced rather than on a numeric axis.
    plot_df['Replacement Level'] = (
        plot_df['Replacement Level'].astype('category')
    )

    # Must be set before the theme objects below are created.
    plotnine.options.figure_size = (14, 6)

    p = (
        ggplot(plot_df, aes(x='Replacement Level', y='Climbers Above'))
        + geom_bar(stat='identity', fill=color_fill, alpha=0.7, width=0.9)
        + geom_text(aes(label='Climbers Above'), va='bottom', size=10)
        + labs(x='Replacement Level $N$ (# of Problems Attempted)',
               y='# Climbers Above Replacement Level')
        + theme_bw()
        + theme(axis_title=element_text(size=16),
                axis_text=element_text(size=12))
        + scale_x_discrete()
    )
    p.save(f'results/eda/{filename}.png', dpi=1000)
    print(f"Saved file {filename}.png")

    return p
28+
29+
30+
def height_histogram(df, color_fill, filename):
    """Draw a histogram of the ``Height`` column and save it as a PNG.

    Args:
        df: Data handed to ``ggplot``; must provide a ``Height`` column.
        color_fill: Fill colour for the histogram bars.
        filename: Stem of the PNG written under ``results/eda/``.

    Returns:
        The ``ggplot`` object that was saved.
    """
    # Global plotnine option; set before any layer or theme is built.
    plotnine.options.figure_size = (12, 6)

    # Assemble the figure one component at a time, in the original order.
    p = ggplot(df, aes(x='Height'))
    p = p + geom_histogram(binwidth=2, fill=color_fill, alpha=0.7)
    p = p + labs(x='Height (cm)', y='# of Climbers')
    p = p + theme_bw()
    p = p + scale_x_continuous(breaks=range(160, 191, 2))
    p = p + scale_y_continuous(breaks=range(0, 15, 3))
    p = p + theme(
        axis_title=element_text(size=24),
        axis_text=element_text(size=18),
    )

    output_path = f'results/eda/{filename}.png'
    p.save(output_path, dpi=1000)
    print(f"Saved file {filename}.png")

    return p
46+
47+
def problem_attempts(df, color_fill, filename):
    """Histogram the per-problem row counts of ``df`` and save it as a PNG.

    Rows are counted per ``Problem_ID``; the resulting counts are binned
    with a bin width of 5.

    Args:
        df: DataFrame with a ``Problem_ID`` column.
        color_fill: Fill colour for the histogram bars.
        filename: Stem of the PNG written under ``results/eda/``.

    Returns:
        The ``ggplot`` object that was saved.
    """
    # One row per problem, holding how often that problem appears in df.
    per_problem = df['Problem_ID'].value_counts()
    problem_id_counts = per_problem.reset_index()
    problem_id_counts.columns = ['Problem_ID', 'Frequency']

    # Global plotnine option; set before any layer or theme is built.
    plotnine.options.figure_size = (16, 6)

    # Assemble the figure one component at a time, in the original order.
    p = ggplot(problem_id_counts, aes(x='Frequency'))
    p = p + geom_histogram(binwidth=5, fill=color_fill, alpha=0.8)
    p = p + labs(x='# of Climbers Attempted', y='# of Problems')
    p = p + theme_bw()
    p = p + theme(
        axis_title=element_text(size=24),
        axis_text=element_text(size=18),
    )
    p = p + scale_x_continuous(breaks=range(0, 167, 10))
    p = p + scale_y_continuous(breaks=range(0, 900, 60))

    output_path = f'results/eda/{filename}.png'
    p.save(output_path, dpi=1000)
    print(f"Saved file {filename}.png")

    return p
68+
69+
if __name__ == "__main__":
    # Script entry point: load the CSV inputs and render every EDA figure.
    import pandas as pd

    df = pd.read_csv('data/men_data.csv')
    heights = pd.read_csv('data/climbers_heights.csv', index_col=0)

    # Read as a module-level global by climbers_above_replacement_level.
    REPLACEMENT_LEVELS = [25, 50, 100, 250, 500, 1000]

    # All figures share one fill colour.
    fill = '#00abff'

    # (plotting function, input data, output file stem), in render order.
    figures = (
        (climbers_above_replacement_level, df,
         'climbers_above_replacement_level'),
        (height_histogram, heights, 'height_histogram'),
        (problem_attempts, df, 'problem_id_frequency_histogram'),
    )
    for make_figure, data, stem in figures:
        make_figure(data, fill, stem)
Loading

results/eda/height_histogram.png

510 KB
Loading
851 KB
Loading

0 commit comments

Comments
 (0)