Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update ExploratoryDataAnalysis.ipynb #8

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 44 additions & 213 deletions ExploratoryDataAnalysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -292,17 +292,17 @@
"metadata": {},
"outputs": [],
"source": [
"def makehist(datainput, label, color):\n",
"# def makehist(datainput, label, color):\n",
" fig = plt.figure(figsize=(16,4))\n",
" mean = datainput.mean(axis = 0) #changeoutcomevar\n",
" plt.hist(datainput, bins=(20), align='mid', color=color, alpha=0.5)\n",
" plt.hist(data['Variable'], bins=(20), align='mid', color='green', alpha=0.5)\n",
" plt.axvline(x=mean, color=color, linestyle='-')\n",
" plt.xlabel(label)\n",
" plt.xlabel('Variable')\n",
" plt.ylabel('Frequency')\n",
" plt.title((label + ' Histogram'))\n",
" plt.tight_layout()\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + label))"
" plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + 'Variable'))"
]
},
{
Expand All @@ -311,7 +311,7 @@
"metadata": {},
"outputs": [],
"source": [
"makehist(data['Variable'], 'Variable', 'green')\n",
"# makehist(data['Variable'], 'Variable', 'green')\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -328,12 +328,12 @@
"metadata": {},
"outputs": [],
"source": [
"def makebox(datainput, label):\n",
"# def makebox(datainput, label):\n",
" fig = plt.figure(figsize =(16, 4))\n",
" plt.boxplot(datainput)\n",
" plt.title((label + ' Box Plot'))\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + label))"
" plt.boxplot(data['Variable'])\n",
" plt.title(('Variable' + ' Box Plot'))\n",
" plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + 'Variable'))"
]
},
{
Expand All @@ -342,7 +342,7 @@
"metadata": {},
"outputs": [],
"source": [
"makebox(data['Variable'], 'Variable')\n",
"# makebox(data['Variable'], 'Variable')\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -359,12 +359,12 @@
"metadata": {},
"outputs": [],
"source": [
"def makeleaf(datainput, label):\n",
"# def makeleaf(datainput, label):\n",
" fig = plt.figure(figsize =(16, 4))\n",
" plt.stem(datainput)\n",
" plt.title((label + ' Leaf Plot'))\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + label))"
" plt.stem(data['Variable'])\n",
" plt.title(('Variable' + ' Leaf Plot'))\n",
" plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + 'Variable'))"
]
},
{
Expand All @@ -373,7 +373,7 @@
"metadata": {},
"outputs": [],
"source": [
"makeleaf(data['Variable'], 'Variable')\n",
"# makeleaf(data['Variable'], 'Variable')\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -390,14 +390,14 @@
"metadata": {},
"outputs": [],
"source": [
"def makebubble(x, y, s, label):\n",
"# def makebubble(x, y, s, label):\n",
" fig = plt.figure(figsize =(16, 4))\n",
" plt.scatter(x=x, y=y, s=s)\n",
" plt.title((label + ' Bubble plot'))\n",
" plt.scatter(x=data['Variable 1'], y=data['Variable 2'], s=data['Size Variable'])\n",
" plt.title(('Variable 1 vs Variable 2' + ' Bubble plot'))\n",
" plt.xlabel(XLabel) #Put desired name for x Axis here\n",
" plt.ylabel(YLabel) #Put desired name for y Axis here\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + label))\n",
" plt.savefig((filesource + 'Variable 1 vs Variable 2' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + 'Variable 1 vs Variable 2'))\n",
" \n",
"# fig = px.scatter(df.query(\"\"), x=\"statistics\", y = \"Medical Methods\", size = \"pop\", color=\"corr.columns\") # need to change for color's names"
]
Expand All @@ -410,7 +410,7 @@
},
"outputs": [],
"source": [
"makebubble(data['Variable 1'], data['Variable 2'], data['Size Variable'], 'Variable 1 vs Variable 2')\n",
"# makebubble(data['Variable 1'], data['Variable 2'], data['Size Variable'], 'Variable 1 vs Variable 2')\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -427,13 +427,13 @@
"metadata": {},
"outputs": [],
"source": [
"def makerun(xAxis, yAxis, label):\n",
" fig = plt.plot(xAxis, yAxis)\n",
" plt.title((label + ' Run Chart'))\n",
"# def makerun(xAxis, yAxis, label):\n",
" fig = plt.plot(data['Variable'], data['Variable'])\n",
" plt.title((data['Variable'] + ' Run Chart'))\n",
" plt.xlabel(XLabel) #Put desired name for x Axis here\n",
" plt.ylabel(YLabel) #Put desired name for y Axis here\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved Run Chart of ' + label))"
" plt.savefig((filesource + data['Variable'] + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved Run Chart of ' + data['Variable']))"
]
},
{
Expand All @@ -444,7 +444,7 @@
},
"outputs": [],
"source": [
"makerun(data['Variable'], data['Variable'], 'Variable')\n",
"# makerun(data['Variable'], data['Variable'], data['Variable'])\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -461,13 +461,13 @@
"metadata": {},
"outputs": [],
"source": [
"def makemultivariate(var1, var2, label):\n",
" fig = plt.plot(var1, var2)\n",
" plt.title((label + ' Multivariate Chart'))\n",
"# def makemultivariate(var1, var2, label):\n",
" fig = plt.plot(data['Variable'], data['Variable'])\n",
" plt.title(('Variable' + ' Multivariate Chart'))\n",
" plt.xlabel(XLabel) #Put desired name for X Axis here\n",
" plt.ylabel(YLabel) #Put desired name for Y Axis here\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved Run Chart of ' + label))"
" plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved Run Chart of ' + 'Variable'))"
]
},
{
Expand All @@ -476,7 +476,7 @@
"metadata": {},
"outputs": [],
"source": [
"make_multivariate(data['Variable'], data['Variable'], 'Variable')\n",
"# make_multivariate(data['Variable'], data['Variable'], 'Variable')\n",
"#Repeat above command for each numeric Variable in data"
]
},
Expand All @@ -493,14 +493,14 @@
"metadata": {},
"outputs": [],
"source": [
"def makescatter(x, y, label):\n",
"# def makescatter(x, y, label):\n",
" fig = plt.figure(figsize=(16,4))\n",
" plt.scatter(x=x, y=y)\n",
" plt.title((label + ' Scatter plot'))\n",
" plt.scatter(x=data['Variable'], y=data['Variable'])\n",
" plt.title(('Variable' + ' Scatter plot'))\n",
" plt.xlabel(XLabel) #Put desired name for X Axis here\n",
" plt.ylabel(YLabel) #Put desired name for Y Axis here\n",
" plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + label))"
" plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n",
" print(('Saved plot of ' + 'Variable'))"
]
},
{
Expand All @@ -509,15 +509,16 @@
"metadata": {},
"outputs": [],
"source": [
"makeScatterplot(data['Variable'], data['Variable'], 'Variable')\n",
"#makeScatterplot(data['Variable'], data['Variable'], 'Variable')\n",
"#Repeat above command for each numeric Variable in data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Examples of Visualizations"
"## Examples of Visualizations\n",
"To try the example visualizations below, first uncomment the plotting function definitions above (for example 'makebox', 'makeleaf', and 'makebubble') and run those cells."
]
},
{
Expand Down Expand Up @@ -616,13 +617,6 @@
"makescatter(data['ECG'], data['Apple Watch'], 'ECG Data vs Apple Watch')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Unsupervised Learning Section- "
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -724,169 +718,6 @@
"plt.figure(figsize=(10,7))\n",
"plt.scatter(dfc[:,0], dfc[:,1], c=cluster.labels_, cmap='rainbow') #plot for hieractical clustering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PCA (Principal Component Analysis)\n",
"<a id=\"pca\"></a>\n",
"https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/\n",
"\n",
"https://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html\n",
"\n",
"PCA selects the successive components that explain the maximum variance in the signal.\n",
"This is useful to us because we have a large number of features. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Need to scale prior to doing PCA\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"sc = StandardScaler()\n",
"sdfc = sc.fit_transform(dfc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from sklearn import decomposition\n",
"pca = decomposition.PCA()\n",
"pca.fit(sdfc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(pca.explained_variance_ratio_)\n",
"print(pca.singular_values_) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pca.n_components = 9\n",
"pc = pca.fit(dfc)\n",
"\n",
"result=pd.DataFrame(pca.transform(dfc), columns=['PCA%i' % i for i in range(9)], index=dfc.index)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pcft = pca.fit_transform(dfc)\n",
"\n",
"pc_df = pd.DataFrame(data=pcft, columns= ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9'])\n",
"\n",
"#Example below:\n",
"#pc_df['Cluster'] = data['Definition']\n",
"#pc_df['Status'] = data['Status']\n",
"#pc_df['Gender'] = data['Gender']\n",
"pc_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"import seaborn as sns\n",
"dfvar = pd.DataFrame({'var':pca.explained_variance_ratio_,\n",
" 'PC':['PC1','PC2','PC3','PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9']})\n",
"sns.barplot(x='PC',y=\"var\", \n",
" data=dfvar, color=\"c\");\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install matplotlib widget Ipython magic: https://github.com/matplotlib/jupyter-matplotlib\n",
"\n",
"Problems with matplotlib widget not working: https://github.com/matplotlib/jupyter-matplotlib/issues/66"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib widget \n",
"\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import axes3d, Axes3D #<-- Note the capitalization! \n",
"\n",
"\n",
"pc_df[insertvarhere]=pd.Categorical(pc_df[insertvarhere]) #need to change insertvarhere\n",
"my_color=pc_df[insertvarhere].cat.codes #need to change insertvarhere\n",
"\n",
"# Plot initialisation\n",
"fig = plt.figure()\n",
"ax = Axes3D(fig) \n",
"ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap='Accent', s=60)\n",
"\n",
"#make simple, bare axis lines through space:\n",
"xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))\n",
"ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')\n",
"yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))\n",
"ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')\n",
"zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))\n",
"ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')\n",
" \n",
"# label the axes\n",
"ax.set_xlabel(\"PC1\")\n",
"ax.set_ylabel(\"PC2\")\n",
"ax.set_zlabel(\"PC3\")\n",
"ax.set_title(\"PCA\")\n",
"#ax.legend()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sns.lmplot( x=\"PC1\", y=\"PC5\",\n",
" data=pc_df, \n",
" fit_reg=False, \n",
" hue=Variable, # color by change variable here\n",
" legend=True,\n",
" scatter_kws={\"s\": 80,'alpha':0.3}) # specify the point size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading