update the threshold

denten · denten · commit cc37c5287bcf · 2019-07-31T11:37:05.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ target-corpus/pulpmagazinearchive/
 .DS_Store
 .ipynb_checkpoints
 .ipynb_checkpoints/
-.ipynb_checkpoints
-.ipynb_checkpoints
 __pycache__
+full_source_no_suspected.csv
+full_source_suspected.csv
+
diff --git a/code/06-threshold/exploratory-analysis.ipynb b/code/06-threshold/exploratory-analysis.ipynb
diff --git a/code/06-threshold/histograms/sus_histogram_20.csv b/code/06-threshold/histograms/sus_histogram_20.csv
@@ -0,0 +1 @@
+0,0,0,317,35751,846113,6203183,20360149,39476161,54983685,61646686,58633036,47254712,31625476,16822483,6459778,1483572,149968,5940,313
diff --git a/code/06-threshold/histograms/sus_histogram_40.csv b/code/06-threshold/histograms/sus_histogram_40.csv
@@ -0,0 +1 @@
+0,0,0,0,0,0,14,303,3900,31851,175068,671045,1912299,4290884,7899959,12460190,17395925,22080236,26053401,28930284,30611385,31035301,30274461,28358575,25439994,21814718,17830495,13794981,10041877,6780606,4173488,2286290,1072397,411175,122215,27753,5066,874,160,153
diff --git a/code/06-threshold/histograms/sus_histogram_80.csv b/code/06-threshold/histograms/sus_histogram_80.csv
@@ -0,0 +1 @@
+0,0,0,0,0,0,0,0,0,0,0,0,2,12,55,248,932,2968,8833,23018,55387,119681,238085,432960,736908,1175391,1769954,2520930,3431337,4468622,5621326,6838864,8083796,9312129,10497510,11582726,12591089,13462312,14180184,14750100,15176992,15434393,15532945,15502356,15303263,14971198,14484462,13874113,13131475,12308519,11388724,10425994,9423513,8406982,7392696,6402285,5465863,4576014,3756501,3024105,2369360,1804128,1333724,952566,648860,423537,259923,151252,81324,40891,19229,8524,3619,1447,593,281,103,57,8,145
diff --git a/code/06-threshold/histograms/unsus_histogram_20.csv b/code/06-threshold/histograms/unsus_histogram_20.csv
@@ -0,0 +1 @@
+0,0,0,237,27190,596277,4631724,16301929,32732413,45408459,49669670,45779086,35735757,23171208,11957926,4528064,1077959,127228,6949,514
diff --git a/code/06-threshold/histograms/unsus_histogram_40.csv b/code/06-threshold/histograms/unsus_histogram_40.csv
@@ -0,0 +1 @@
+0,0,0,0,0,0,6,231,3308,23882,124643,471634,1387953,3243771,6214817,10087112,14359450,18372963,21613822,23794637,24843838,24825832,23834232,21944854,19374340,16361417,13147103,10024105,7177537,4780389,2922670,1605394,767881,310078,100698,26530,5768,1181,195,319
diff --git a/code/06-threshold/histograms/unsus_histogram_80.csv b/code/06-threshold/histograms/unsus_histogram_80.csv
@@ -0,0 +1 @@
+0,0,0,0,0,0,0,0,0,0,0,0,0,6,30,201,805,2503,6814,17068,39656,84987,166202,305432,528014,859939,1321518,1922253,2674147,3540670,4523517,5563595,6647221,7712229,8720654,9652309,10463003,11150819,11695399,12099238,12361846,12481992,12473808,12352024,12098399,11735833,11257465,10687389,10042846,9331494,8576931,7784486,6974621,6172482,5390772,4633333,3924099,3253438,2656197,2124192,1660148,1262522,934479,670915,462022,305859,193451,116627,65411,35287,17826,8704,3908,1860,816,365,130,65,19,300
diff --git a/code/06-threshold/threshold.ipynb b/code/06-threshold/threshold.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from scipy import stats\n",
+    "from numpy import genfromtxt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# numpy way of reading csv\n",
+    "# i got bored waiting to time this\n",
+    "# it is much faster to read into pandas and cast into numpy\n",
+    "# with df.values\n",
+    "test1 = genfromtxt('full_source_no_suspected.csv', delimiter=',', dtype=\"float64\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(test1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pandas way of reading csv\n",
+    "# %time:\n",
+    "# CPU times: user 2min 16s, sys: 3.56 s, total: 2min 19s\n",
+    "# Wall time: 2min 25s\n",
+    "\n",
+    "test2 = pd.read_csv(\"full_source_no_suspected.csv\", header=None, dtype=\"float64\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 9 µs, sys: 0 ns, total: 9 µs\n",
+      "Wall time: 17.9 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "# read the csv in chunks of 10000 rows instead of in a single pass\n",
+    "# pd.concat returns a NEW frame rather than modifying its inputs, so the\n",
+    "# result has to be assigned; concatenating once over all chunks also avoids\n",
+    "# re-copying the accumulated frame on every iteration\n",
+    "# note: the combined frame must still fit in memory\n",
+    "chunk_reader = pd.read_csv(\"full_source_no_suspected.csv\", header=None, dtype=\"float64\", chunksize=10000)\n",
+    "test3 = pd.concat(chunk_reader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: []\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DataFrame.values returns the data as a numpy array\n",
+    "nondarray = test2.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(nondarray)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# return a tuple of (row, column) index arrays for all values greater than 0.9\n",
+    "np.nonzero(nondarray > .9)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#for row in nondarray:\n",
+    "#sns.distplot(nondarray, kde=False, rug=True);\n",
+    "\n",
+    "print(len(nondarray), len(nondarray[0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.set(color_codes=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+0,0,0,317,35751,846113,6203183,20360149,39476161,54983685,61646686,58633036,47254712,31625476,16822483,6459778,1483572,149968,5940,313`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+0,0,0,0,0,0,14,303,3900,31851,175068,671045,1912299,4290884,7899959,12460190,17395925,22080236,26053401,28930284,30611385,31035301,30274461,28358575,25439994,21814718,17830495,13794981,10041877,6780606,4173488,2286290,1072397,411175,122215,27753,5066,874,160,153`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+0,0,0,0,0,0,0,0,0,0,0,0,2,12,55,248,932,2968,8833,23018,55387,119681,238085,432960,736908,1175391,1769954,2520930,3431337,4468622,5621326,6838864,8083796,9312129,10497510,11582726,12591089,13462312,14180184,14750100,15176992,15434393,15532945,15502356,15303263,14971198,14484462,13874113,13131475,12308519,11388724,10425994,9423513,8406982,7392696,6402285,5465863,4576014,3756501,3024105,2369360,1804128,1333724,952566,648860,423537,259923,151252,81324,40891,19229,8524,3619,1447,593,281,103,57,8,145
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+0,0,0,237,27190,596277,4631724,16301929,32732413,45408459,49669670,45779086,35735757,23171208,11957926,4528064,1077959,127228,6949,514`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+0,0,0,0,0,0,6,231,3308,23882,124643,471634,1387953,3243771,6214817,10087112,14359450,18372963,21613822,23794637,24843838,24825832,23834232,21944854,19374340,16361417,13147103,10024105,7177537,4780389,2922670,1605394,767881,310078,100698,26530,5768,1181,195,319`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+0,0,0,0,0,0,0,0,0,0,0,0,0,6,30,201,805,2503,6814,17068,39656,84987,166202,305432,528014,859939,1321518,1922253,2674147,3540670,4523517,5563595,6647221,7712229,8720654,9652309,10463003,11150819,11695399,12099238,12361846,12481992,12473808,12352024,12098399,11735833,11257465,10687389,10042846,9331494,8576931,7784486,6974621,6172482,5390772,4633333,3924099,3253438,2656197,2124192,1660148,1262522,934479,670915,462022,305859,193451,116627,65411,35287,17826,8704,3908,1860,816,365,130,65,19,300`