From a2b09d0368772a2c6ab3e8b460589f2f2bc63145 Mon Sep 17 00:00:00 2001 From: Christina Yu Date: Fri, 5 Aug 2022 16:03:28 +0800 Subject: [PATCH 1/2] Update ExploratoryDataAnalysis.ipynb final Review --- ExploratoryDataAnalysis.ipynb | 257 ++++++---------------------------- 1 file changed, 44 insertions(+), 213 deletions(-) diff --git a/ExploratoryDataAnalysis.ipynb b/ExploratoryDataAnalysis.ipynb index 24c246c..f10b751 100644 --- a/ExploratoryDataAnalysis.ipynb +++ b/ExploratoryDataAnalysis.ipynb @@ -292,17 +292,17 @@ "metadata": {}, "outputs": [], "source": [ - "def makehist(datainput, label, color):\n", + "# def makehist(datainput, label, color):\n", " fig = plt.figure(figsize=(16,4))\n", " mean = datainput.mean(axis = 0) #changeoutcomevar\n", - " plt.hist(datainput, bins=(20), align='mid', color=color, alpha=0.5)\n", + " plt.hist(data['Variable'], bins=(20), align='mid', color='green', alpha=0.5)\n", " plt.axvline(x=mean, color=color, linestyle='-')\n", - " plt.xlabel(label)\n", + " plt.xlabel('Variable')\n", " plt.ylabel('Frequency')\n", " plt.title((label + ' Histogram'))\n", " plt.tight_layout()\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved plot of ' + label))" + " plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved plot of ' + 'Variable'))" ] }, { @@ -311,7 +311,7 @@ "metadata": {}, "outputs": [], "source": [ - "makehist(data['Variable'], 'Variable', 'green')\n", + "# makehist(data['Variable'], 'Variable', 'green')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -328,12 +328,12 @@ "metadata": {}, "outputs": [], "source": [ - "def makebox(datainput, label):\n", + "# def makebox(datainput, label):\n", " fig = plt.figure(figsize =(16, 4))\n", - " plt.boxplot(datainput)\n", - " plt.title((label + ' Box Plot'))\n", - " plt.savefig((filesource + label + '.png'), 
dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved plot of ' + label))" + " plt.boxplot(data['Variable'])\n", + " plt.title(('Variable' + ' Box Plot'))\n", + " plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved plot of ' + 'Variable'))" ] }, { @@ -342,7 +342,7 @@ "metadata": {}, "outputs": [], "source": [ - "makebox(data['Variable'], 'Variable')\n", + "# makebox(data['Variable'], 'Variable')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -359,12 +359,12 @@ "metadata": {}, "outputs": [], "source": [ - "def makeleaf(datainput, label):\n", + "# def makeleaf(datainput, label):\n", " fig = plt.figure(figsize =(16, 4))\n", - " plt.stem(datainput)\n", - " plt.title((label + ' Leaf Plot'))\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved plot of ' + label))" + " plt.stem(data['Variable'])\n", + " plt.title(('Variable' + ' Leaf Plot'))\n", + " plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved plot of ' + 'Variable'))" ] }, { @@ -373,7 +373,7 @@ "metadata": {}, "outputs": [], "source": [ - "makeleaf(data['Variable'], 'Variable')\n", + "# makeleaf(data['Variable'], 'Variable')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -390,14 +390,14 @@ "metadata": {}, "outputs": [], "source": [ - "def makebubble(x, y, s, label):\n", + "# def makebubble(x, y, s, label):\n", " fig = plt.figure(figsize =(16, 4))\n", - " plt.scatter(x=x, y=y, s=s)\n", - " plt.title((label + ' Bubble plot'))\n", + " plt.scatter(x=data['Variable 1'], y=data['Variable 2'], s=data['Size Variable'])\n", + " plt.title(('Variable 1 vs Variable 2' + ' Bubble plot'))\n", " plt.xlabel(XLabel) #Put desired name for x Axis here\n", " plt.ylabel(YLabel) #Put desired name for 
y Axis here\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved plot of ' + label))\n", + " plt.savefig((filesource + 'Variable 1 vs Variable 2' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved plot of ' + 'Variable 1 vs Variable 2'))\n", " \n", "# fig = px.scatter(df.query(\"\"), x=\"statistics\", y = \"Medical Methods\", size = \"pop\", color=\"corr.columns\") # need to change for color's names" ] @@ -410,7 +410,7 @@ }, "outputs": [], "source": [ - "makebubble(data['Variable 1'], data['Variable 2'], data['Size Variable'], 'Variable 1 vs Variable 2')\n", + "# makebubble(data['Variable 1'], data['Variable 2'], data['Size Variable'], 'Variable 1 vs Variable 2')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -427,13 +427,13 @@ "metadata": {}, "outputs": [], "source": [ - "def makerun(xAxis, yAxis, label):\n", - " fig = plt.plot(xAxis, yAxis)\n", - " plt.title((label + ' Run Chart'))\n", + "# def makerun(xAxis, yAxis, label):\n", + " fig = plt.plot(data['Variable'], data['Variable'])\n", + " plt.title((data['Variable'] + ' Run Chart'))\n", " plt.xlabel(XLabel) #Put desired name for x Axis here\n", " plt.ylabel(YLabel) #Put desired name for y Axis here\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved Run Chart of ' + label))" + " plt.savefig((filesource + data['Variable'] + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved Run Chart of ' + data['Variable']))" ] }, { @@ -444,7 +444,7 @@ }, "outputs": [], "source": [ - "makerun(data['Variable'], data['Variable'], 'Variable')\n", + "# makerun(data['Variable'], data['Variable'], data['Variable'])\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -461,13 +461,13 @@ "metadata": {}, "outputs": [], "source": 
[ - "def makemultivariate(var1, var2, label):\n", - " fig = plt.plot(var1, var2)\n", - " plt.title((label + ' Multivariate Chart'))\n", + "# def makemultivariate(var1, var2, label):\n", + " fig = plt.plot(data['Variable'], data['Variable'])\n", + " plt.title(('Variable' + ' Multivariate Chart'))\n", " plt.xlabel(XLabel) #Put desired name for X Axis here\n", " plt.ylabel(YLabel) #Put desired name for Y Axis here\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved Run Chart of ' + label))" + " plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved Run Chart of ' + 'Variable'))" ] }, { @@ -476,7 +476,7 @@ "metadata": {}, "outputs": [], "source": [ - "make_multivariate(data['Variable'], data['Variable'], 'Variable')\n", + "# make_multivariate(data['Variable'], data['Variable'], 'Variable')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -493,14 +493,14 @@ "metadata": {}, "outputs": [], "source": [ - "def makescatter(x, y, label):\n", + "# def makescatter(x, y, label):\n", " fig = plt.figure(figsize=(16,4))\n", - " plt.scatter(x=x, y=y)\n", - " plt.title((label + ' Scatter plot'))\n", + " plt.scatter(x=data['Variable'], y=data['Variable'])\n", + " plt.title(('Variable' + ' Scatter plot'))\n", " plt.xlabel(XLabel) #Put desired name for X Axis here\n", " plt.ylabel(YLabel) #Put desired name for Y Axis here\n", - " plt.savefig((filesource + label + '.png'), dpi=100) #change filesource or add as input to function if variable\n", - " print(('Saved plot of ' + label))" + " plt.savefig((filesource + 'Variable' + '.png'), dpi=100) #change filesource or add as input to function if variable\n", + " print(('Saved plot of ' + 'Variable'))" ] }, { @@ -509,7 +509,7 @@ "metadata": {}, "outputs": [], "source": [ - "makeScatterplot(data['Variable'], data['Variable'], 'Variable')\n", + 
"#makeScatterplot(data['Variable'], data['Variable'], 'Variable')\n", "#Repeat above command for each numeric Variable in data" ] }, @@ -517,7 +517,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Examples of Visualizations" + "## Examples of Visualizations\n", + "If you want to try these examples of visualizations, you need to uncomment all the previous functions like 'makebox', 'makeleaf', 'makebubble'" ] }, { @@ -616,13 +617,6 @@ "makescatter(data['ECG'], data['Apple Watch'], 'ECG Data vs Apple Watch')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Unsupervised Learning Section- " - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -724,169 +718,6 @@ "plt.figure(figsize=(10,7))\n", "plt.scatter(dfc[:,0], dfc[:,1], c=cluster.labels_, cmap='rainbow') #plot for hieractical clustering" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PCA (Prinicipal Component Analysis)\n", - "\n", - "https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/\n", - "\n", - "https://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html\n", - "\n", - "PCA selects the successive components that explain the maximum variance in the signal.\n", - "This is useful to us because we have a large amount of features. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Need to scale prior to doing PCA\n", - "\n", - "from sklearn.preprocessing import StandardScaler\n", - "sc = StandardScaler()\n", - "sdfc = sc.fit_transform(dfc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "from sklearn import decomposition\n", - "pca = decomposition.PCA()\n", - "pca.fit(sdfc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(pca.explained_variance_ratio_)\n", - "print(pca.singular_values_) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pca.n_components = 9\n", - "pc = pca.fit(dfc)\n", - "\n", - "result=pd.DataFrame(pca.transform(dfc), columns=['PCA%i' % i for i in range(9)], index=dfc.index)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pcft = pca.fit_transform(dfc)\n", - "\n", - "pc_df = pd.DataFrame(data=pcft, columns= ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9'])\n", - "\n", - "#Example below:\n", - "#pc_df['Cluster'] = data['Definition']\n", - "#pc_df['Status'] = data['Status']\n", - "#pc_df['Gender'] = data['Gender']\n", - "pc_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import seaborn as sns\n", - "dfvar = pd.DataFrame({'var':pca.explained_variance_ratio_,\n", - " 'PC':['PC1','PC2','PC3','PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9']})\n", - "sns.barplot(x='PC',y=\"var\", \n", - " data=dfvar, color=\"c\");\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install matplotlib widget Ipython magic: https://github.com/matplotlib/jupyter-matplotlib\n", 
- "\n", - "Problems with matplotlib widget not working: https://github.com/matplotlib/jupyter-matplotlib/issues/66" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib widget \n", - "\n", - "import matplotlib.pyplot as plt\n", - "from mpl_toolkits.mplot3d import axes3d, Axes3D #<-- Note the capitalization! \n", - "\n", - "\n", - "pc_df[insertvarhere]=pd.Categorical(pc_df[insertvarhere]) #need to change insertvarhere\n", - "my_color=pc_df[insertvarhere].cat.codes #need to change insertvarhere\n", - "\n", - "# Plot initialisation\n", - "fig = plt.figure()\n", - "ax = Axes3D(fig) \n", - "ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap='Accent', s=60)\n", - "\n", - "#make simple, bare axis lines through space:\n", - "xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))\n", - "ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')\n", - "yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))\n", - "ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')\n", - "zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))\n", - "ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')\n", - " \n", - "# label the axes\n", - "ax.set_xlabel(\"PC1\")\n", - "ax.set_ylabel(\"PC2\")\n", - "ax.set_zlabel(\"PC3\")\n", - "ax.set_title(\"PCA\")\n", - "#ax.legend()\n", - "plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sns.lmplot( x=\"PC1\", y=\"PC5\",\n", - " data=pc_df, \n", - " fit_reg=False, \n", - " hue=Variable, # color by change variable here\n", - " legend=True,\n", - " scatter_kws={\"s\": 80,'alpha':0.3}) # specify the point size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 234b4d2ef09d9a9441182c5f463ff177e74e3371 Mon Sep 17 00:00:00 2001 From: Howard-nolan 
<106356427+Howard-nolan@users.noreply.github.com> Date: Sat, 6 Aug 2022 15:58:41 -0400 Subject: [PATCH 2/2] Update README.md --- README.md | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 512d926..4a382c5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Exploratory Data Analysis **Objectives:** -Exploratory Data Analysis is a standard process in the early stages of digital biomarker development. EDA allows us to explore relationships between variables in the data, examine trends, analyze missingness of data, and begin the process of understanding the link between the data and the physiological state we are studying. +Exploratory data analysis (EDA) is the first step in the data analysis process. It involves using a number of tools to get a picture of what’s happening within a dataset. When done right, exploratory data analysis can highlight new aspects of a dataset, inform researchers where to focus their effort, and lead to new conclusions. **Input:** .csv file with entire dataset. @@ -10,21 +10,45 @@ Exploratory Data Analysis is a standard process in the early stages of digital b Figures for EDA (after filtering all the NULL data) **Functions:** -This repository currently contains the following functions. 
+The EDA module has seven main categories of functions: -| Function | README | -| ------ | ------ | -| makehist | Plot histograms of all variables in data | -| makebox | Plot boxplot of all variables in data | -| makeleaf | Plot leafplot of all variables in data | -| makebubble | Plot bubble chart of all variables in data | -| makerun | Plot run chart of all variables in data | -| makemultivariate | Plot multivariate chart of all variables in data | -| makescatter | Plot scatterplot of all variables in data | +- Preliminary EDA +- Covariance matrices +- Missing value analysis +- Distribution visualization +- Data visualizations +- Clustering +- Principal Component Analysis +**Overview:** -**Publications:** +**Preliminary EDA** +Preliminary Exploratory Data Analysis is the most basic overview of the data. Methods like len() and .describe() can allow a researcher to see how large their dataset is and get different statistical descriptions of their dataset. When you’re first reading a data file into your Jupyter notebook, this is a good place to start. + +**Covariance matrices** + +Covariance is a measure of how much two random variables vary together. It’s similar to variance, but where variance tells you how a single variable varies, covariance tells you how two variables vary together. This can give you a sense of what interesting relationships may exist in your data that you can explore further. + +**Missing Value Analysis** + +The missing value analysis section of the repository utilizes the ‘missingno’ package that can be run to see where missing values exist in the data. This is useful if you want to see where potential gaps exist in your dataset. + +**Plot Distribution** + +Distribution plots can give a researcher a sense of the spread of their dataset and the balance of different variables. The EDA module has methods that allow a researcher to determine the balance of the outcome variable, plot distributions by the outcome class, and perform outlier analysis. 
+ +**Data Visualizations** + +Data visualization is the largest part of the EDA module. The visualizations allow a researcher to understand their data in many ways by providing different views of the dataset. The data visualizations can be manipulated to better fit a particular dataset and create the most useful visualizations. The visualizations in the EDA module utilize the matplotlib library. We have seven different visualizations that a researcher can use to understand their data. + +**Clustering** + +There are various methods for clustering data in the EDA module. These allow data to be grouped based on shared attributes using machine learning. The two types of data clustering that the EDA module contains are K-means clustering and hierarchical clustering. + +**Principal Component Analysis (PCA)** + +PCA allows researchers to get a sense of which components (combinations of variables) explain the greatest variance in their data. This can be useful when researchers are deciding what variables they want to focus their effort on. The EDA module provides code for PCA that utilizes methods from the sklearn library and visualizations from matplotlib. #### Code Available Now: