From cdaeb063374957cda5a288e2b13501576eab54b1 Mon Sep 17 00:00:00 2001 From: Ismail Bhinderwala Date: Thu, 23 Jan 2025 15:20:47 -0800 Subject: [PATCH] Added imports in init. Changed Readme as well --- README.md | 2 +- docs/example.ipynb | 199 +++++++++++++++++++++- poetry.lock | 6 + src/datpro/__init__.py | 6 +- src/datpro/{dataprofiler.py => datpro.py} | 0 tests/test_detect_anomalies.py | 2 +- tests/test_plotify.py | 2 +- 7 files changed, 207 insertions(+), 10 deletions(-) rename src/datpro/{dataprofiler.py => datpro.py} (100%) diff --git a/README.md b/README.md index 7d488d0..445e42e 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The `datpro` package simplifies data profiling by providing essential functions - `detect_anomalies()`: Detects anomalies in the dataset by identifying missing values, outliers, and duplicates. It calculates the percentage of missing data for each column, detects numerical outliers using the interquartile range (IQR) method, and identifies duplicate rows. This function helps in understanding and addressing potential data quality issues in the dataset. -- `plotify()` : Makes it easy to visualize your DataFrame by automatically creating the right type of plots based on your data. It handles different column types—numeric, categorical, binary, or text—and generates appropriate visualizations like scatter plots for numeric data, box plots for numeric vs. categorical data, and bar charts for categorical data. For text columns, it even creates a word cloud to highlight key terms. +- `plotify()`: A versatile function that simplifies DataFrame visualization by automatically generating appropriate plots based on the data types of your columns. It supports various plot types, including histograms and density plots for numeric data, bar charts for categorical data, scatter plots for pairwise numeric relationships, correlation heatmaps for exploring numeric variable relationships, and box plots for numeric vs. categorical comparisons. For pairwise categorical columns, it generates stacked bar charts. The function dynamically analyzes your DataFrame and provides insightful visualizations tailored to your data structure, making exploratory data analysis efficient and comprehensive. This package provides tools for exploring and cleaning data by summarizing statistics, detecting anomalies, and creating visualizations. While other tools like `ydata-profiling` (https://docs.profiling.ydata.ai/latest/) offer similar functionalities and generate detailed reports, this package focuses on providing modular functions that can be easily integrated into custom workflows. diff --git a/docs/example.ipynb b/docs/example.ipynb index 9c72d1d..fff2cbb 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -11,19 +11,206 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "import datpro\n", - "\n", - "print(datpro.__version__)" + "import datpro as dp\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeIncomeSpending_ScoreGenderRegion
066NaN26.373678MaleSouth
16566369.65180920.906870FemaleSouth
25970764.09227847.990597MaleWest
36441432.31515331.120625FemaleNorth
45352963.99407012.016596FemaleEast
..................
10051862455.03724822.795113FemaleNorth
10062435361.90120518.846863MaleSouth
10075156554.07254617.076530FemaleSouth
10086352799.13684742.219961MaleEast
10094252727.99382627.395330MaleEast
\n", + "

1010 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Age Income Spending_Score Gender Region\n", + "0 66 NaN 26.373678 Male South\n", + "1 65 66369.651809 20.906870 Female South\n", + "2 59 70764.092278 47.990597 Male West\n", + "3 64 41432.315153 31.120625 Female North\n", + "4 53 52963.994070 12.016596 Female East\n", + "... ... ... ... ... ...\n", + "1005 18 62455.037248 22.795113 Female North\n", + "1006 24 35361.901205 18.846863 Male South\n", + "1007 51 56554.072546 17.076530 Female South\n", + "1008 63 52799.136847 42.219961 Male East\n", + "1009 42 52727.993826 27.395330 Male East\n", + "\n", + "[1010 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('../data/example_data.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'missing_values': {'Income': {'missing_count': 50,\n", + " 'missing_percentage': np.float64(4.95)},\n", + " 'Spending_Score': {'missing_count': 30,\n", + " 'missing_percentage': np.float64(2.97)}},\n", + " 'outliers': {'Income': {'outlier_count': 24, 'outlier_percentage': 2.38},\n", + " 'Spending_Score': {'outlier_count': 4, 'outlier_percentage': 0.4}},\n", + " 'duplicates': {'duplicate_count': np.int64(10),\n", + " 'duplicate_percentage': np.float64(0.99)}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dp.detect_anomalies(df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dataprofiler", "language": "python", "name": "python3" }, @@ -37,7 +224,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/poetry.lock b/poetry.lock index 84bdbcd..e79be1e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3590,19 +3590,25 @@ files = [ {file = "SQLAlchemy-2.0.37-cp313-cp313-win32.whl", hash = "sha256:293f9ade06b2e68dd03cfb14d49202fac47b7bb94bffcff174568c951fbc7af2"}, {file = "SQLAlchemy-2.0.37-cp313-cp313-win_amd64.whl", hash = "sha256:d70f53a0646cc418ca4853da57cf3ddddbccb8c98406791f24426f2dd77fd0e2"}, {file = "SQLAlchemy-2.0.37-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:44f569d0b1eb82301b92b72085583277316e7367e038d97c3a1a899d9a05e342"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2eae3423e538c10d93ae3e87788c6a84658c3ed6db62e6a61bb9495b0ad16bb"}, {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfff7be361048244c3aa0f60b5e63221c5e0f0e509f4e47b8910e22b57d10ae7"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:5bc3339db84c5fb9130ac0e2f20347ee77b5dd2596ba327ce0d399752f4fce39"}, {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:84b9f23b0fa98a6a4b99d73989350a94e4a4ec476b9a7dfe9b79ba5939f5e80b"}, {file = "SQLAlchemy-2.0.37-cp37-cp37m-win32.whl", hash = "sha256:51bc9cfef83e0ac84f86bf2b10eaccb27c5a3e66a1212bef676f5bee6ef33ebb"}, {file = "SQLAlchemy-2.0.37-cp37-cp37m-win_amd64.whl", hash = "sha256:8e47f1af09444f87c67b4f1bb6231e12ba6d4d9f03050d7fc88df6d075231a49"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6b788f14c5bb91db7f468dcf76f8b64423660a05e57fe277d3f4fad7b9dcb7ce"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521ef85c04c33009166777c77e76c8a676e2d8528dc83a57836b63ca9c69dcd1"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75311559f5c9881a9808eadbeb20ed8d8ba3f7225bef3afed2000c2a9f4d49b9"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cce918ada64c956b62ca2c2af59b125767097ec1dca89650a6221e887521bfd7"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9d087663b7e1feabea8c578d6887d59bb00388158e8bff3a76be11aa3f748ca2"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cf95a60b36997dad99692314c4713f141b61c5b0b4cc5c3426faad570b31ca01"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-win32.whl", hash = "sha256:d75ead7dd4d255068ea0f21492ee67937bd7c90964c8f3c2bea83c7b7f81b95f"}, {file = "SQLAlchemy-2.0.37-cp38-cp38-win_amd64.whl", hash = "sha256:74bbd1d0a9bacf34266a7907d43260c8d65d31d691bb2356f41b17c2dca5b1d0"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:648ec5acf95ad59255452ef759054f2176849662af4521db6cb245263ae4aa33"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:35bd2df269de082065d4b23ae08502a47255832cc3f17619a5cea92ce478b02b"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f581d365af9373a738c49e0c51e8b18e08d8a6b1b15cc556773bcd8a192fa8b"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82df02816c14f8dc9f4d74aea4cb84a92f4b0620235daa76dde002409a3fbb5a"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94b564e38b344d3e67d2e224f0aec6ba09a77e4582ced41e7bfd0f757d926ec9"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:955a2a765aa1bd81aafa69ffda179d4fe3e2a3ad462a736ae5b6f387f78bfeb8"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-win32.whl", hash = "sha256:03f0528c53ca0b67094c4764523c1451ea15959bbf0a8a8a3096900014db0278"}, {file = "SQLAlchemy-2.0.37-cp39-cp39-win_amd64.whl", hash = "sha256:4b12885dc85a2ab2b7d00995bac6d967bffa8594123b02ed21e8eb2205a7584b"}, diff --git a/src/datpro/__init__.py b/src/datpro/__init__.py index 3fb7e66..fac2351 100644 --- a/src/datpro/__init__.py +++ b/src/datpro/__init__.py @@ -1,3 +1,7 @@ # read version from installed package from importlib.metadata import version -__version__ = version("datpro") \ No newline at end of file +__version__ = version("datpro") + +from datpro.datpro import detect_anomalies +from datpro.datpro import plotify +from datpro.datpro import summarize_data \ No newline at end of file diff --git a/src/datpro/dataprofiler.py b/src/datpro/datpro.py similarity index 100% rename from src/datpro/dataprofiler.py rename to src/datpro/datpro.py diff --git a/tests/test_detect_anomalies.py b/tests/test_detect_anomalies.py index 07b03b4..161c279 100644 --- a/tests/test_detect_anomalies.py +++ b/tests/test_detect_anomalies.py @@ -1,6 +1,6 @@ import pytest import pandas as pd -from datpro.dataprofiler import detect_anomalies +from datpro.datpro import detect_anomalies def test_missing_values(): """Test if missing values are detected correctly.""" diff --git a/tests/test_plotify.py b/tests/test_plotify.py index 8c9d233..2aa7b48 100644 --- a/tests/test_plotify.py +++ b/tests/test_plotify.py @@ -1,4 +1,4 @@ -from datpro.dataprofiler import plotify +from datpro.datpro import plotify import pytest import pandas as pd