Skip to content

Commit 6a1a2af

Browse files
committed Jul 4, 2015
Added to statistical inference section with scipy: hypothesis testing notebook.
1 parent 85675ab commit 6a1a2af

7 files changed

+4680
-0
lines changed
 

‎scipy/2002FemPreg.dat.gz

1020 KB
Binary file not shown.

‎scipy/2002FemPreg.dct

+245
Large diffs are not rendered by default.

‎scipy/first.py

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""This file contains code used in "Think Stats",
2+
by Allen B. Downey, available from greenteapress.com
3+
4+
Copyright 2014 Allen B. Downey
5+
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6+
"""
7+
8+
from __future__ import print_function
9+
10+
import math
11+
import numpy as np
12+
13+
import nsfg
14+
import thinkstats2
15+
import thinkplot
16+
17+
18+
def MakeFrames():
    """Loads the pregnancy records and splits out the live births.

    Live births are the rows whose outcome code is 1.  They are further
    divided into first babies (birthord == 1) and all other live births.

    returns: tuple of DataFrames (all live births, first babies, others)
    """
    pregnancies = nsfg.ReadFemPreg()

    live = pregnancies[pregnancies.outcome == 1]
    firsts = live[live.birthord == 1]
    others = live[live.birthord != 1]

    # sanity-check the partition against the expected row counts
    expected_sizes = ((live, 9148), (firsts, 4413), (others, 4735))
    for frame, size in expected_sizes:
        assert len(frame) == size

    return live, firsts, others
34+
35+
36+
def Summarize(live, firsts, others):
    """Prints summary statistics of pregnancy length.

    Reports mean, variance and standard deviation for all live births,
    then compares first babies with the others (means, variances,
    difference in means and Cohen's effect size).

    live: DataFrame of all live births
    firsts: DataFrame of first babies
    others: DataFrame of other live births
    """
    lengths = live.prglngth
    live_mean = lengths.mean()
    live_var = lengths.var()
    live_std = lengths.std()

    print('Live mean', live_mean)
    print('Live variance', live_var)
    print('Live std', live_std)

    first_lengths = firsts.prglngth
    other_lengths = others.prglngth

    first_mean = first_lengths.mean()
    other_mean = other_lengths.mean()

    first_var = first_lengths.var()
    other_var = other_lengths.var()

    print('Mean')
    print('First babies', first_mean)
    print('Others', other_mean)

    print('Variance')
    print('First babies', first_var)
    print('Others', other_var)

    # the difference is measured in weeks; 7 * 24 converts weeks to hours
    diff_weeks = first_mean - other_mean
    print('Difference in weeks', diff_weeks)
    print('Difference in hours', diff_weeks * 7 * 24)

    # express the difference as a percentage of a 39 week pregnancy
    print('Difference relative to 39 weeks', diff_weeks / 39 * 100)

    effect_size = thinkstats2.CohenEffectSize(first_lengths, other_lengths)
    print('Cohen d', effect_size)
68+
69+
70+
def PrintExtremes(live):
    """Saves a histogram of pregnancy lengths and prints both tails.

    The ten smallest and the ten largest pregnancy lengths are printed
    together with their frequencies.

    live: DataFrame of live births
    """
    hist = thinkstats2.Hist(live.prglngth)
    thinkplot.Hist(hist, label='live births')

    save_options = dict(root='first_nsfg_hist_live',
                        title='Histogram',
                        xlabel='weeks',
                        ylabel='frequency')
    thinkplot.Save(**save_options)

    # each entry pairs a heading with the Hist method that supplies its rows
    tails = (('Shortest lengths:', hist.Smallest),
             ('Longest lengths:', hist.Largest))
    for heading, select in tails:
        print(heading)
        for weeks, freq in select(10):
            print(weeks, freq)
90+
91+
92+
def MakeHists(live):
    """Saves histograms of several variables of the live births.

    Plots birth weight (pounds and ounces parts), mother's age rounded
    down to whole years, and pregnancy length, one figure per variable.

    live: DataFrame of live births
    """
    # (column, transform or None, output root, x label, axis limits or None)
    specs = (
        ('birthwgt_lb', None, 'first_wgt_lb_hist', 'pounds',
         [-1, 14, 0, 3200]),
        ('birthwgt_oz', None, 'first_wgt_oz_hist', 'ounces',
         [-1, 16, 0, 1200]),
        ('agepreg', np.floor, 'first_agepreg_hist', 'years',
         None),
        ('prglngth', None, 'first_prglngth_hist', 'weeks',
         [-1, 53, 0, 5000]),
    )

    for column, transform, root, xlabel, axis in specs:
        values = getattr(live, column)
        if transform is not None:
            values = transform(values)

        hist = thinkstats2.Hist(values, label=column)
        thinkplot.Hist(hist)

        options = dict(root=root, xlabel=xlabel, ylabel='frequency')
        if axis is not None:
            # only pass explicit axis limits where some were specified
            options['axis'] = axis
        thinkplot.Save(**options)
124+
125+
126+
def MakeComparison(firsts, others):
    """Saves a side-by-side histogram of pregnancy length.

    The bars for first babies are aligned right and the bars for the
    others are aligned left, so both series are visible for each week.

    firsts: DataFrame of first babies
    others: DataFrame of other live births
    """
    # pair each histogram with the side its bars are aligned to
    series = [
        (thinkstats2.Hist(firsts.prglngth, label='first'), 'right'),
        (thinkstats2.Hist(others.prglngth, label='other'), 'left'),
    ]

    bar_width = 0.45
    thinkplot.PrePlot(2)
    for hist, side in series:
        thinkplot.Hist(hist, align=side, width=bar_width)

    save_options = dict(root='first_nsfg_hist',
                        title='Histogram',
                        xlabel='weeks',
                        ylabel='frequency',
                        axis=[27, 46, 0, 2700])
    thinkplot.Save(**save_options)
145+
146+
147+
def main(script):
    """Runs the first-babies analysis from start to finish.

    Builds the DataFrames, saves the histograms, prints the extreme
    pregnancy lengths and finally prints the summary statistics.

    script: string script name (not used by the body)
    """
    frames = MakeFrames()
    live, firsts, others = frames

    MakeHists(live)
    PrintExtremes(live)
    MakeComparison(firsts, others)
    Summarize(live, firsts, others)
154+
155+
156+
# Run the analysis only when executed as a script; the command-line
# arguments (script name first) are forwarded to main.
if __name__ == '__main__':
    import sys

    main(*sys.argv)
159+
160+

‎scipy/hypothesis.ipynb

+652
Large diffs are not rendered by default.

‎scipy/nsfg.py

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""This file contains code for use with "Think Stats",
2+
by Allen B. Downey, available from greenteapress.com
3+
4+
Copyright 2010 Allen B. Downey
5+
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6+
"""
7+
8+
from __future__ import print_function
9+
10+
from collections import defaultdict
11+
import numpy as np
12+
import sys
13+
14+
import thinkstats2
15+
16+
17+
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Loads the NSFG pregnancy file and cleans it.

    The Stata dictionary is used to read the gzip-compressed fixed-width
    data file; the resulting frame is recoded by CleanFemPreg.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzipped data file

    returns: DataFrame
    """
    dictionary = thinkstats2.ReadStataDct(dct_file)
    frame = dictionary.ReadFixedWidth(dat_file, compression='gzip')

    # CleanFemPreg's return value is not used; the cleaned frame is returned
    CleanFemPreg(frame)
    return frame
30+
31+
32+
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    Modifies df in place.  Every cleaned column is assigned back with
    df[name] = ... rather than through chained indexing or
    replace(inplace=True) on an attribute-accessed column, because those
    forms are not guaranteed to write through to the frame.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz,
        hpagelb, babysex, nbrnaliv and cmintvw
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df.agepreg / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    pounds = df.birthwgt_lb
    pounds = pounds.mask(pounds > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    df['birthwgt_lb'] = pounds.replace(na_vals, np.nan)
    df['birthwgt_oz'] = df.birthwgt_oz.replace(na_vals, np.nan)
    df['hpagelb'] = df.hpagelb.replace(na_vals, np.nan)

    df['babysex'] = df.babysex.replace([7, 9], np.nan)
    df['nbrnaliv'] = df.nbrnaliv.replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
62+
63+
64+
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a caseid column

    returns: defaultdict that maps from caseid to the list of index
             labels of the rows in df that have that caseid, in row order
    """
    d = defaultdict(list)
    # Series.items() yields (index label, value) pairs; the older
    # iteritems() spelling is no longer available in current pandas
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
75+
76+
77+
def main(script):
    """Tests the functions in this module.

    Reads the pregnancy frame and checks its size, one caseid and the
    frequency of selected values in several columns against the expected
    counts.

    script: string script name
    """
    df = ReadFemPreg()
    print(df.shape)

    assert len(df) == 13593

    assert df.caseid[13592] == 12571

    # (column, value, expected number of rows holding that value)
    expected_counts = (
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    )
    for column, value, count in expected_counts:
        assert getattr(df, column).value_counts()[value] == count

    # the largest finalwgt value is expected to occur six times
    weights = df.finalwgt.value_counts()
    largest = max(weights.keys())
    assert df.finalwgt.value_counts()[largest] == 6

    print('%s: All tests passed.' % script)
104+
105+
# Run the self-test only when executed as a script; the command-line
# arguments (script name first) are forwarded to main.
if __name__ == '__main__':
    main(*sys.argv)

‎scipy/thinkplot.py

+716
Large diffs are not rendered by default.

‎scipy/thinkstats2.py

+2,801
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.