Added imports in `__init__.py`. Changed README as well

UBC-MDS · Jan 23, 2025 · cdaeb06 · cdaeb06
1 parent 98f9126
commit cdaeb06
Show file tree

Hide file tree

Showing 7 changed files with 207 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ The `datpro` package simplifies data profiling by providing essential functions
 
 - `detect_anomalies()`: Detects anomalies in the dataset by identifying missing values, outliers, and duplicates. It calculates the percentage of missing data for each column, detects numerical outliers using the interquartile range (IQR) method, and identifies duplicate rows. This function helps in understanding and addressing potential data quality issues in the dataset.
 
-- `plotify()` : Makes it easy to visualize your DataFrame by automatically creating the right type of plots based on your data. It handles different column types—numeric, categorical, binary, or text—and generates appropriate visualizations like scatter plots for numeric data, box plots for numeric vs. categorical data, and bar charts for categorical data. For text columns, it even creates a word cloud to highlight key terms.
+- `plotify()`: A versatile function that simplifies DataFrame visualization by automatically generating appropriate plots based on the data types of your columns. It supports various plot types, including histograms and density plots for numeric data, bar charts for categorical data, scatter plots for pairwise numeric relationships, correlation heatmaps for exploring numeric variable relationships, and box plots for numeric vs. categorical comparisons. For pairwise categorical columns, it generates stacked bar charts. The function dynamically analyzes your DataFrame and provides insightful visualizations tailored to your data structure, making exploratory data analysis efficient and comprehensive.
 
 This package provides tools for exploring and cleaning data by summarizing statistics, detecting anomalies, and creating visualizations. While other tools like `ydata-profiling` (https://docs.profiling.ydata.ai/latest/) offer similar functionalities and generate detailed reports, this package focuses on providing modular functions that can be easily integrated into custom workflows.
 

diff --git a/docs/example.ipynb b/docs/example.ipynb
@@ -11,19 +11,206 @@
         },
         {
             "cell_type": "code",
-            "execution_count": null,
+            "execution_count": 4,
             "metadata": {},
             "outputs": [],
             "source": [
-                "import datpro\n",
-                "\n",
-                "print(datpro.__version__)"
+                "import datpro as dp\n",
+                "import pandas as pd"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 5,
+            "metadata": {},
+            "outputs": [
+                {
+                    "data": {
+                        "text/html": [
+                            "<div>\n",
+                            "<style scoped>\n",
+                            "    .dataframe tbody tr th:only-of-type {\n",
+                            "        vertical-align: middle;\n",
+                            "    }\n",
+                            "\n",
+                            "    .dataframe tbody tr th {\n",
+                            "        vertical-align: top;\n",
+                            "    }\n",
+                            "\n",
+                            "    .dataframe thead th {\n",
+                            "        text-align: right;\n",
+                            "    }\n",
+                            "</style>\n",
+                            "<table border=\"1\" class=\"dataframe\">\n",
+                            "  <thead>\n",
+                            "    <tr style=\"text-align: right;\">\n",
+                            "      <th></th>\n",
+                            "      <th>Age</th>\n",
+                            "      <th>Income</th>\n",
+                            "      <th>Spending_Score</th>\n",
+                            "      <th>Gender</th>\n",
+                            "      <th>Region</th>\n",
+                            "    </tr>\n",
+                            "  </thead>\n",
+                            "  <tbody>\n",
+                            "    <tr>\n",
+                            "      <th>0</th>\n",
+                            "      <td>66</td>\n",
+                            "      <td>NaN</td>\n",
+                            "      <td>26.373678</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1</th>\n",
+                            "      <td>65</td>\n",
+                            "      <td>66369.651809</td>\n",
+                            "      <td>20.906870</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>2</th>\n",
+                            "      <td>59</td>\n",
+                            "      <td>70764.092278</td>\n",
+                            "      <td>47.990597</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>West</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>3</th>\n",
+                            "      <td>64</td>\n",
+                            "      <td>41432.315153</td>\n",
+                            "      <td>31.120625</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>North</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>4</th>\n",
+                            "      <td>53</td>\n",
+                            "      <td>52963.994070</td>\n",
+                            "      <td>12.016596</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>...</th>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "      <td>...</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1005</th>\n",
+                            "      <td>18</td>\n",
+                            "      <td>62455.037248</td>\n",
+                            "      <td>22.795113</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>North</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1006</th>\n",
+                            "      <td>24</td>\n",
+                            "      <td>35361.901205</td>\n",
+                            "      <td>18.846863</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1007</th>\n",
+                            "      <td>51</td>\n",
+                            "      <td>56554.072546</td>\n",
+                            "      <td>17.076530</td>\n",
+                            "      <td>Female</td>\n",
+                            "      <td>South</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1008</th>\n",
+                            "      <td>63</td>\n",
+                            "      <td>52799.136847</td>\n",
+                            "      <td>42.219961</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "    <tr>\n",
+                            "      <th>1009</th>\n",
+                            "      <td>42</td>\n",
+                            "      <td>52727.993826</td>\n",
+                            "      <td>27.395330</td>\n",
+                            "      <td>Male</td>\n",
+                            "      <td>East</td>\n",
+                            "    </tr>\n",
+                            "  </tbody>\n",
+                            "</table>\n",
+                            "<p>1010 rows × 5 columns</p>\n",
+                            "</div>"
+                        ],
+                        "text/plain": [
+                            "      Age        Income  Spending_Score  Gender Region\n",
+                            "0      66           NaN       26.373678    Male  South\n",
+                            "1      65  66369.651809       20.906870  Female  South\n",
+                            "2      59  70764.092278       47.990597    Male   West\n",
+                            "3      64  41432.315153       31.120625  Female  North\n",
+                            "4      53  52963.994070       12.016596  Female   East\n",
+                            "...   ...           ...             ...     ...    ...\n",
+                            "1005   18  62455.037248       22.795113  Female  North\n",
+                            "1006   24  35361.901205       18.846863    Male  South\n",
+                            "1007   51  56554.072546       17.076530  Female  South\n",
+                            "1008   63  52799.136847       42.219961    Male   East\n",
+                            "1009   42  52727.993826       27.395330    Male   East\n",
+                            "\n",
+                            "[1010 rows x 5 columns]"
+                        ]
+                    },
+                    "execution_count": 5,
+                    "metadata": {},
+                    "output_type": "execute_result"
+                }
+            ],
+            "source": [
+                "df =  pd.read_csv('../data/example_data.csv')\n",
+                "df"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 7,
+            "metadata": {},
+            "outputs": [
+                {
+                    "data": {
+                        "text/plain": [
+                            "{'missing_values': {'Income': {'missing_count': 50,\n",
+                            "   'missing_percentage': np.float64(4.95)},\n",
+                            "  'Spending_Score': {'missing_count': 30,\n",
+                            "   'missing_percentage': np.float64(2.97)}},\n",
+                            " 'outliers': {'Income': {'outlier_count': 24, 'outlier_percentage': 2.38},\n",
+                            "  'Spending_Score': {'outlier_count': 4, 'outlier_percentage': 0.4}},\n",
+                            " 'duplicates': {'duplicate_count': np.int64(10),\n",
+                            "  'duplicate_percentage': np.float64(0.99)}}"
+                        ]
+                    },
+                    "execution_count": 7,
+                    "metadata": {},
+                    "output_type": "execute_result"
+                }
+            ],
+            "source": [
+                "dp.detect_anomalies(df)"
             ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": []
         }
     ],
     "metadata": {
         "kernelspec": {
-            "display_name": "Python 3",
+            "display_name": "dataprofiler",
             "language": "python",
             "name": "python3"
         },
@@ -37,7 +224,7 @@
             "name": "python",
             "nbconvert_exporter": "python",
             "pygments_lexer": "ipython3",
-            "version": "3.8.5"
+            "version": "3.11.0"
         }
     },
     "nbformat": 4,

diff --git a/poetry.lock b/poetry.lock
diff --git a/src/datpro/__init__.py b/src/datpro/__init__.py
@@ -1,3 +1,7 @@
 # read version from installed package
 from importlib.metadata import version
-__version__ = version("datpro")
+__version__ = version("datpro")
+
+from datpro.datpro import detect_anomalies
+from datpro.datpro import plotify
+from datpro.datpro import summarize_data
diff --git a/src/datpro/dataprofiler.py → src/datpro/datpro.py b/src/datpro/dataprofiler.py → src/datpro/datpro.py
diff --git a/tests/test_detect_anomalies.py b/tests/test_detect_anomalies.py
@@ -1,6 +1,6 @@
 import pytest
 import pandas as pd
-from datpro.dataprofiler import detect_anomalies
+from datpro.datpro import detect_anomalies
 
 def test_missing_values():
     """Test if missing values are detected correctly."""

diff --git a/tests/test_plotify.py b/tests/test_plotify.py
@@ -1,4 +1,4 @@
-from datpro.dataprofiler import plotify
+from datpro.datpro import plotify
 import pytest
 import pandas as pd