Skip to content
This repository was archived by the owner on Jan 24, 2025. It is now read-only.

Commit cc37c52

Browse files
committed
update the threshold
1 parent d5e288c commit cc37c52

9 files changed

+880
-2
lines changed

.gitignore

+3-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ target-corpus/pulpmagazinearchive/
66
.DS_Store
77
.ipynb_checkpoints
88
.ipynb_checkpoints/
9-
.ipynb_checkpoints
10-
.ipynb_checkpoints
119
__pycache__
10+
full_source_no_suspected.csv
11+
full_source_suspected.csv
12+

code/06-threshold/exploratory-analysis.ipynb

+668
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,317,35751,846113,6203183,20360149,39476161,54983685,61646686,58633036,47254712,31625476,16822483,6459778,1483572,149968,5940,313
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,0,0,0,14,303,3900,31851,175068,671045,1912299,4290884,7899959,12460190,17395925,22080236,26053401,28930284,30611385,31035301,30274461,28358575,25439994,21814718,17830495,13794981,10041877,6780606,4173488,2286290,1072397,411175,122215,27753,5066,874,160,153
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,0,0,0,0,0,0,0,0,0,2,12,55,248,932,2968,8833,23018,55387,119681,238085,432960,736908,1175391,1769954,2520930,3431337,4468622,5621326,6838864,8083796,9312129,10497510,11582726,12591089,13462312,14180184,14750100,15176992,15434393,15532945,15502356,15303263,14971198,14484462,13874113,13131475,12308519,11388724,10425994,9423513,8406982,7392696,6402285,5465863,4576014,3756501,3024105,2369360,1804128,1333724,952566,648860,423537,259923,151252,81324,40891,19229,8524,3619,1447,593,281,103,57,8,145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,237,27190,596277,4631724,16301929,32732413,45408459,49669670,45779086,35735757,23171208,11957926,4528064,1077959,127228,6949,514
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,0,0,0,6,231,3308,23882,124643,471634,1387953,3243771,6214817,10087112,14359450,18372963,21613822,23794637,24843838,24825832,23834232,21944854,19374340,16361417,13147103,10024105,7177537,4780389,2922670,1605394,767881,310078,100698,26530,5768,1181,195,319
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0,0,0,0,0,0,0,0,0,0,0,0,0,6,30,201,805,2503,6814,17068,39656,84987,166202,305432,528014,859939,1321518,1922253,2674147,3540670,4523517,5563595,6647221,7712229,8720654,9652309,10463003,11150819,11695399,12099238,12361846,12481992,12473808,12352024,12098399,11735833,11257465,10687389,10042846,9331494,8576931,7784486,6974621,6172482,5390772,4633333,3924099,3253438,2656197,2124192,1660148,1262522,934479,670915,462022,305859,193451,116627,65411,35287,17826,8704,3908,1860,816,365,130,65,19,300

code/06-threshold/threshold.ipynb

+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import numpy as np\n",
10+
"import pandas as pd\n",
11+
"import matplotlib.pyplot as plt\n",
12+
"import seaborn as sns\n",
13+
"from scipy import stats\n",
14+
"from numpy import genfromtxt"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
"# numpy way of reading csv\n",
24+
"# i got bored waiting to time this\n",
25+
"# it is much faster to read into pandas and cast into numpy\n",
26+
"# with df.values\n",
27+
"test1 = genfromtxt('full_source_no_suspected.csv', delimiter=',', dtype=\"float64\")"
28+
]
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": null,
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"type(test1)"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"# pandas way of reading csv\n",
46+
"# %time:\n",
47+
"# CPU times: user 2min 16s, sys: 3.56 s, total: 2min 19s\n",
48+
"# Wall time: 2min 25s\n",
49+
"\n",
50+
"test2 = pd.read_csv(\"full_source_no_suspected.csv\", header=None, dtype=\"float64\")"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 6,
56+
"metadata": {},
57+
"outputs": [
58+
{
59+
"name": "stdout",
60+
"output_type": "stream",
61+
"text": [
62+
"CPU times: user 9 µs, sys: 0 ns, total: 9 µs\n",
63+
"Wall time: 17.9 µs\n"
64+
]
65+
}
66+
],
67+
"source": [
68+
"# read in chunks for files significantly larger than memory\n",
69+
"# using pandas data frame\n",
70+
"# not working at the moment: pd.concat returns a new frame and the result below is discarded, so test3 stays empty\n",
71+
"test3 = pd.DataFrame()\n",
72+
"for chunk in pd.read_csv(\"full_source_no_suspected.csv\", header=None, dtype=\"float64\", chunksize=10000):\n",
73+
" pd.concat([test3,chunk])"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 9,
79+
"metadata": {},
80+
"outputs": [
81+
{
82+
"data": {
83+
"text/html": [
84+
"<div>\n",
85+
"<style scoped>\n",
86+
" .dataframe tbody tr th:only-of-type {\n",
87+
" vertical-align: middle;\n",
88+
" }\n",
89+
"\n",
90+
" .dataframe tbody tr th {\n",
91+
" vertical-align: top;\n",
92+
" }\n",
93+
"\n",
94+
" .dataframe thead th {\n",
95+
" text-align: right;\n",
96+
" }\n",
97+
"</style>\n",
98+
"<table border=\"1\" class=\"dataframe\">\n",
99+
" <thead>\n",
100+
" <tr style=\"text-align: right;\">\n",
101+
" <th></th>\n",
102+
" </tr>\n",
103+
" </thead>\n",
104+
" <tbody>\n",
105+
" </tbody>\n",
106+
"</table>\n",
107+
"</div>"
108+
],
109+
"text/plain": [
110+
"Empty DataFrame\n",
111+
"Columns: []\n",
112+
"Index: []"
113+
]
114+
},
115+
"execution_count": 9,
116+
"metadata": {},
117+
"output_type": "execute_result"
118+
}
119+
],
120+
"source": [
121+
"test3"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": null,
127+
"metadata": {},
128+
"outputs": [],
129+
"source": [
130+
"# DataFrame.values returns a numpy array\n",
131+
"nondarray = test2.values"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": null,
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"type(nondarray)"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": null,
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"# return a tuple of (row, column) index arrays for all values greater than 0.9\n",
150+
"np.nonzero(nondarray > .9)"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"#for row in nondarray:\n",
160+
"#sns.distplot(nondarray, kde=False, rug=True);\n",
161+
"\n",
162+
"print(len(nondarray), len(nondarray[0]))"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"sns.set(color_codes=False)"
172+
]
173+
},
174+
{
175+
"cell_type": "code",
176+
"execution_count": null,
177+
"metadata": {},
178+
"outputs": [],
179+
"source": []
180+
}
181+
],
182+
"metadata": {
183+
"kernelspec": {
184+
"display_name": "Python 3",
185+
"language": "python",
186+
"name": "python3"
187+
},
188+
"language_info": {
189+
"codemirror_mode": {
190+
"name": "ipython",
191+
"version": 3
192+
},
193+
"file_extension": ".py",
194+
"mimetype": "text/x-python",
195+
"name": "python",
196+
"nbconvert_exporter": "python",
197+
"pygments_lexer": "ipython3",
198+
"version": "3.7.3"
199+
}
200+
},
201+
"nbformat": 4,
202+
"nbformat_minor": 2
203+
}

0 commit comments

Comments
 (0)