From cdaeb063374957cda5a288e2b13501576eab54b1 Mon Sep 17 00:00:00 2001
From: Ismail Bhinderwala <husmail786@gmail.com>
Date: Thu, 23 Jan 2025 15:20:47 -0800
Subject: [PATCH] Export public API from __init__, rename dataprofiler.py to datpro.py, update README and example notebook

---
 README.md                                 |   2 +-
 docs/example.ipynb                        | 199 +++++++++++++++++++++-
 poetry.lock                               |   6 +
 src/datpro/__init__.py                    |   6 +-
 src/datpro/{dataprofiler.py => datpro.py} |   0
 tests/test_detect_anomalies.py            |   2 +-
 tests/test_plotify.py                     |   2 +-
 7 files changed, 207 insertions(+), 10 deletions(-)
 rename src/datpro/{dataprofiler.py => datpro.py} (100%)
diff --git a/README.md b/README.md
index 7d488d0..445e42e 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ The `datpro` package simplifies data profiling by providing essential functions
 
 - `detect_anomalies()`: Detects anomalies in the dataset by identifying missing values, outliers, and duplicates. It calculates the percentage of missing data for each column, detects numerical outliers using the interquartile range (IQR) method, and identifies duplicate rows. This function helps in understanding and addressing potential data quality issues in the dataset.
 
-- `plotify()` : Makes it easy to visualize your DataFrame by automatically creating the right type of plots based on your data. It handles different column types—numeric, categorical, binary, or text—and generates appropriate visualizations like scatter plots for numeric data, box plots for numeric vs. categorical data, and bar charts for categorical data. For text columns, it even creates a word cloud to highlight key terms.
+- `plotify()`: Visualizes a DataFrame by automatically generating plots appropriate to the data type of each column. It produces histograms and density plots for numeric columns, bar charts for categorical columns, scatter plots for pairs of numeric columns, correlation heatmaps for relationships among numeric variables, box plots for numeric vs. categorical comparisons, and stacked bar charts for pairs of categorical columns. The function inspects your DataFrame to decide which plots apply, so the visualizations are tailored to your data structure and exploratory data analysis becomes faster and more thorough.
 
 This package provides tools for exploring and cleaning data by summarizing statistics, detecting anomalies, and creating visualizations. While other tools like `ydata-profiling` (https://docs.profiling.ydata.ai/latest/) offer similar functionalities and generate detailed reports, this package focuses on providing modular functions that can be easily integrated into custom workflows.
 
diff --git a/docs/example.ipynb b/docs/example.ipynb
index 9c72d1d..fff2cbb 100644
--- a/docs/example.ipynb
+++ b/docs/example.ipynb
@@ -11,19 +11,206 @@
         },
         {
             "cell_type": "code",
-            "execution_count": null,
+            "execution_count": 4,
             "metadata": {},
             "outputs": [],
             "source": [
-                "import datpro\n",
-                "\n",
-                "print(datpro.__version__)"
+                "import datpro as dp\n",
+                "import pandas as pd"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 5,
+            "metadata": {},
+            "outputs": [
+                {
+                    "data": {
+                        "text/html": [
+                            "<div>\n",
+                            "<style scoped>\n",
+                            "    .dataframe tbody tr th:only-of-type {\n",
+                            "        vertical-align: middle;\n",
+                            "    }\n",
+                            "\n",
+                            "    .dataframe tbody tr th {\n",
+                            "        vertical-align: top;\n",
+                            "    }\n",
+                            "\n",
+                            "    .dataframe thead th {\n",
+                            "        text-align: right;\n",
+                            "    }\n",
+                            "</style>\n",
+                            "<table border=\"1\" class=\"dataframe\">\n",
+                            "  <thead>\n",
+                            "    <tr style=\"text-align: right;\">\n",
+                            "      <th></th>\n",
+                            "      <th>Age</th>\n",
+                            "      <th>Income</th>\n",
+                            "      <th>Spending_Score</th>\n",
+                            "      <th>Gender</th>\n",
+                            "      <th>Region</th>\n",
+                            "    </tr>\n",
+                            "  </thead>\n",
+                            "  <tbody>\n",
+                            "    <tr>\n",
+                            "      <th>0</th>\n",
+                            "      <td>66</td>\n",
+                            "      <td>NaN</td>\n",
+                            "      <td>26.373678</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1</th>\n",
+                            "      <td>65</td>\n",
+                            "      <td>66369.651809</td>\n",
+                            "      <td>20.906870</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>2</th>\n",
+                            "      <td>59</td>\n",
+                            "      <td>70764.092278</td>\n",
+                            "      <td>47.990597</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>West</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>3</th>\n",
+                            "      <td>64</td>\n",
+                            "      <td>41432.315153</td>\n",
+                            "      <td>31.120625</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>North</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>4</th>\n",
+                            "      <td>53</td>\n",
+                            "      <td>52963.994070</td>\n",
+                            "      <td>12.016596</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>...</th>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1005</th>\n",
+                            "      <td>18</td>\n",
+                            "      <td>62455.037248</td>\n",
+                            "      <td>22.795113</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>North</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1006</th>\n",
+                            "      <td>24</td>\n",
+                            "      <td>35361.901205</td>\n",
+                            "      <td>18.846863</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1007</th>\n",
+                            "      <td>51</td>\n",
+                            "      <td>56554.072546</td>\n",
+                            "      <td>17.076530</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1008</th>\n",
+                            "      <td>63</td>\n",
+                            "      <td>52799.136847</td>\n",
+                            "      <td>42.219961</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1009</th>\n",
+                            "      <td>42</td>\n",
+                            "      <td>52727.993826</td>\n",
+                            "      <td>27.395330</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "  </tbody>\n",
+                            "</table>\n",
+                            "<p>1010 rows × 5 columns</p>\n",
+                            "</div>"
+                        ],
+                        "text/plain": [
+                            "      Age        Income  Spending_Score  Gender Region\n",
+                            "0      66           NaN       26.373678    Male  South\n",
+                            "1      65  66369.651809       20.906870  Female  South\n",
+                            "2      59  70764.092278       47.990597    Male   West\n",
+                            "3      64  41432.315153       31.120625  Female  North\n",
+                            "4      53  52963.994070       12.016596  Female   East\n",
+                            "...   ...           ...             ...     ...    ...\n",
+                            "1005   18  62455.037248       22.795113  Female  North\n",
+                            "1006   24  35361.901205       18.846863    Male  South\n",
+                            "1007   51  56554.072546       17.076530  Female  South\n",
+                            "1008   63  52799.136847       42.219961    Male   East\n",
+                            "1009   42  52727.993826       27.395330    Male   East\n",
+                            "\n",
+                            "[1010 rows x 5 columns]"
+                        ]
+                    },
+                    "execution_count": 5,
+                    "metadata": {},
+                    "output_type": "execute_result"
+                }
+            ],
+            "source": [
+                "df =  pd.read_csv('../data/example_data.csv')\n",
+                "df"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 7,
+            "metadata": {},
+            "outputs": [
+                {
+                    "data": {
+                        "text/plain": [
+                            "{'missing_values': {'Income': {'missing_count': 50,\n",
+                            "   'missing_percentage': np.float64(4.95)},\n",
+                            "  'Spending_Score': {'missing_count': 30,\n",
+                            "   'missing_percentage': np.float64(2.97)}},\n",
+                            " 'outliers': {'Income': {'outlier_count': 24, 'outlier_percentage': 2.38},\n",
+                            "  'Spending_Score': {'outlier_count': 4, 'outlier_percentage': 0.4}},\n",
+                            " 'duplicates': {'duplicate_count': np.int64(10),\n",
+                            "  'duplicate_percentage': np.float64(0.99)}}"
+                        ]
+                    },
+                    "execution_count": 7,
+                    "metadata": {},
+                    "output_type": "execute_result"
+                }
+            ],
+            "source": [
+                "dp.detect_anomalies(df)"
             ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": []
         }
     ],
     "metadata": {
         "kernelspec": {
-            "display_name": "Python 3",
+            "display_name": "dataprofiler",
             "language": "python",
             "name": "python3"
         },
@@ -37,7 +224,7 @@
             "name": "python",
             "nbconvert_exporter": "python",
             "pygments_lexer": "ipython3",
-            "version": "3.8.5"
+            "version": "3.11.0"
         }
     },
     "nbformat": 4,
diff --git a/poetry.lock b/poetry.lock
index 84bdbcd..e79be1e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3590,19 +3590,25 @@ files = [
     {file = "SQLAlchemy-2.0.37-cp313-cp313-win32.whl", hash = "sha256:293f9ade06b2e68dd03cfb14d49202fac47b7bb94bffcff174568c951fbc7af2"},
     {file = "SQLAlchemy-2.0.37-cp313-cp313-win_amd64.whl", hash = "sha256:d70f53a0646cc418ca4853da57cf3ddddbccb8c98406791f24426f2dd77fd0e2"},
     {file = "SQLAlchemy-2.0.37-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:44f569d0b1eb82301b92b72085583277316e7367e038d97c3a1a899d9a05e342"},
+    {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2eae3423e538c10d93ae3e87788c6a84658c3ed6db62e6a61bb9495b0ad16bb"},
     {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfff7be361048244c3aa0f60b5e63221c5e0f0e509f4e47b8910e22b57d10ae7"},
+    {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:5bc3339db84c5fb9130ac0e2f20347ee77b5dd2596ba327ce0d399752f4fce39"},
     {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:84b9f23b0fa98a6a4b99d73989350a94e4a4ec476b9a7dfe9b79ba5939f5e80b"},
     {file = "SQLAlchemy-2.0.37-cp37-cp37m-win32.whl", hash = "sha256:51bc9cfef83e0ac84f86bf2b10eaccb27c5a3e66a1212bef676f5bee6ef33ebb"},
     {file = "SQLAlchemy-2.0.37-cp37-cp37m-win_amd64.whl", hash = "sha256:8e47f1af09444f87c67b4f1bb6231e12ba6d4d9f03050d7fc88df6d075231a49"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6b788f14c5bb91db7f468dcf76f8b64423660a05e57fe277d3f4fad7b9dcb7ce"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521ef85c04c33009166777c77e76c8a676e2d8528dc83a57836b63ca9c69dcd1"},
+    {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75311559f5c9881a9808eadbeb20ed8d8ba3f7225bef3afed2000c2a9f4d49b9"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cce918ada64c956b62ca2c2af59b125767097ec1dca89650a6221e887521bfd7"},
+    {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9d087663b7e1feabea8c578d6887d59bb00388158e8bff3a76be11aa3f748ca2"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cf95a60b36997dad99692314c4713f141b61c5b0b4cc5c3426faad570b31ca01"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-win32.whl", hash = "sha256:d75ead7dd4d255068ea0f21492ee67937bd7c90964c8f3c2bea83c7b7f81b95f"},
     {file = "SQLAlchemy-2.0.37-cp38-cp38-win_amd64.whl", hash = "sha256:74bbd1d0a9bacf34266a7907d43260c8d65d31d691bb2356f41b17c2dca5b1d0"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:648ec5acf95ad59255452ef759054f2176849662af4521db6cb245263ae4aa33"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:35bd2df269de082065d4b23ae08502a47255832cc3f17619a5cea92ce478b02b"},
+    {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f581d365af9373a738c49e0c51e8b18e08d8a6b1b15cc556773bcd8a192fa8b"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82df02816c14f8dc9f4d74aea4cb84a92f4b0620235daa76dde002409a3fbb5a"},
+    {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94b564e38b344d3e67d2e224f0aec6ba09a77e4582ced41e7bfd0f757d926ec9"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:955a2a765aa1bd81aafa69ffda179d4fe3e2a3ad462a736ae5b6f387f78bfeb8"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-win32.whl", hash = "sha256:03f0528c53ca0b67094c4764523c1451ea15959bbf0a8a8a3096900014db0278"},
     {file = "SQLAlchemy-2.0.37-cp39-cp39-win_amd64.whl", hash = "sha256:4b12885dc85a2ab2b7d00995bac6d967bffa8594123b02ed21e8eb2205a7584b"},
diff --git a/src/datpro/__init__.py b/src/datpro/__init__.py
index 3fb7e66..fac2351 100644
--- a/src/datpro/__init__.py
+++ b/src/datpro/__init__.py
@@ -1,3 +1,7 @@
 # read version from installed package
 from importlib.metadata import version
-__version__ = version("datpro")
\ No newline at end of file
+__version__ = version("datpro")
+
+from datpro.datpro import detect_anomalies
+from datpro.datpro import plotify
+from datpro.datpro import summarize_data
\ No newline at end of file
diff --git a/src/datpro/dataprofiler.py b/src/datpro/datpro.py
similarity index 100%
rename from src/datpro/dataprofiler.py
rename to src/datpro/datpro.py
diff --git a/tests/test_detect_anomalies.py b/tests/test_detect_anomalies.py
index 07b03b4..161c279 100644
--- a/tests/test_detect_anomalies.py
+++ b/tests/test_detect_anomalies.py
@@ -1,6 +1,6 @@
 import pytest
 import pandas as pd
-from datpro.dataprofiler import detect_anomalies
+from datpro.datpro import detect_anomalies
 
 def test_missing_values():
     """Test if missing values are detected correctly."""
diff --git a/tests/test_plotify.py b/tests/test_plotify.py
index 8c9d233..2aa7b48 100644
--- a/tests/test_plotify.py
+++ b/tests/test_plotify.py
@@ -1,4 +1,4 @@
-from datpro.dataprofiler import plotify
+from datpro.datpro import plotify
 import pytest
 import pandas as pd 
 

	Age	Income	Spending_Score	Gender	Region
0	66	NaN	26.373678	Male	South
1	65	66369.651809	20.906870	Female	South
2	59	70764.092278	47.990597	Male	West
3	64	41432.315153	31.120625	Female	North
4	53	52963.994070	12.016596	Female	East
...	...	...	...	...	...
1005	18	62455.037248	22.795113	Female	North
1006	24	35361.901205	18.846863	Male	South
1007	51	56554.072546	17.076530	Female	South
1008	63	52799.136847	42.219961	Male	East
1009	42	52727.993826	27.395330	Male	East