Odd size revamp (#247)

* fixed formatting * updated flake8 check * Fixed test * Fixed test * Removed unnecessary dependencies in dev req file * Fixed odd size tests * Fixed mypy error * Fixed typing syntax * Updated odd size title key * Update info['statistics'] with describe stats * Updated tutorial notebook * Fixed tests
cleanlab · Feb 7, 2024 · 972f060 · 972f060
1 parent 4b4932d
commit 972f060
Show file tree

Hide file tree

Showing 9 changed files with 110 additions and 92 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,7 +29,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pytest pytest-cov psutil -e ".[all]"
+          pip install -e ".[all]"
+          pip install -r requirements-dev.txt
         shell: bash
       - name: Test with coverage
         run: pytest --verbose --cov=src/cleanvision/ --cov-config .coveragerc --cov-report=xml tests/

diff --git a/docs/source/tutorials/tutorial.ipynb b/docs/source/tutorials/tutorial.ipynb
@@ -107,7 +107,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 1. Using CleanVision to detect default issue types"
+    "### 1. Using CleanVision to detect issues in your dataset"
    ]
   },
   {
@@ -124,9 +124,6 @@
     "# Initialize imagelab with your dataset\n",
     "imagelab = Imagelab(data_path=dataset_path)\n",
     "\n",
-    "# Visualize a few sample images from the dataset\n",
-    "imagelab.visualize(num_images=8)\n",
-    "\n",
     "# Find issues\n",
     "imagelab.find_issues()"
    ]
@@ -153,17 +150,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The main way to interface with your data is via the `Imagelab` class. This class can be used to understand the issues in your dataset at a high level (global overview) and low level (issues and quality scores for each image) as well as additional information about the dataset. It has three main attributes:\n",
+    "The main way to interface with your data is via the [Imagelab](https://cleanvision.readthedocs.io/en/latest/cleanvision/imagelab.html#cleanvision.imagelab.Imagelab) class. This class can be used to understand the issues in your dataset at a high level (global overview) and low level (issues and quality scores for each image) as well as additional information about the dataset. It has three main attributes:\n",
+    "\n",
     "- `Imagelab.issue_summary`\n",
     "- `Imagelab.issues`\n",
     "- `Imagelab.info`\n",
     "\n",
     "#### imagelab.issue_summary\n",
-    "Dataframe with global summary of all issue types detected in your dataset and the overall prevalence of each type.\n",
+    "This is a Dataframe containing a comprehensive summary of all detected issue types within your dataset, along with their respective prevalence levels. Each row in this summary includes the following information:\n",
     "\n",
-    "In each row:\\\n",
-    "`issue_type` - name of the issue\\\n",
-    "`num_images` - number of images of that issue type found in the dataset"
+    "`issue_type`: The name of the detected issue.\\\n",
+    "`num_images`: The number of images exhibiting the identified issue within the dataset."
    ]
   },
   {
@@ -301,7 +298,7 @@
     "tags": []
    },
    "source": [
-    "You can see **entropy** values for each image in the dataset as shown below."
+    "You can see **size** statistics for the dataset below. Here we observe that both the 25th and 75th percentiles are 256 for this dataset, hence images that are far away from this range are detected as oddly sized."
    ]
   },
   {
@@ -310,7 +307,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "imagelab.info[\"statistics\"][\"entropy\"]"
+    "imagelab.info[\"statistics\"][\"size\"]"
    ]
   },
   {

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -4,9 +4,6 @@ mypy
 pre-commit
 pytest
 pytest-cov
-pytest-lazy-fixture
-datasets>=2.7.0
-torchvision>=0.12.0
 black
 build
 flake8

diff --git a/src/cleanvision/__init__.py b/src/cleanvision/__init__.py
@@ -1,18 +1,20 @@
 import sys
+from typing import Any, Union
+
 from cleanvision.imagelab import Imagelab as _Imagelab
 
 PYTHON_VERSION_INFO = sys.version_info
 
 
-def get_version() -> str:
+def get_version() -> Union[str, Any]:
     if sys.version_info.major >= 3 and sys.version_info.minor >= 8:
         import importlib.metadata
 
         return importlib.metadata.version("cleanvision")
     else:
         import importlib_metadata
 
-        return importlib_metadata.version("cleanvision")  # type:ignore
+        return importlib_metadata.version("cleanvision")
 
 
 try:

diff --git a/src/cleanvision/imagelab.py b/src/cleanvision/imagelab.py
@@ -502,9 +502,7 @@ def _visualize(
             if show_id:
                 title_info["ids"] = [f"id : {i}" for i in indices]
             if issue_type == IssueType.ODD_SIZE.value:
-                title_info["size"] = [
-                    f"original size: {image.size}" for image in images
-                ]
+                title_info["size"] = [f"size: {image.size}" for image in images]
 
             if images:
                 VizManager.individual_images(

diff --git a/src/cleanvision/issue_managers/image_property.py b/src/cleanvision/issue_managers/image_property.py
@@ -1,10 +1,10 @@
 import math
 from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Union, overload
+from typing import Any, Dict, List, Optional, Union, overload
 
 import numpy as np
 import pandas as pd
-from PIL import ImageStat, ImageFilter
+from PIL import ImageFilter, ImageStat
 from PIL.Image import Image
 
 from cleanvision.issue_managers import IssueType
@@ -48,12 +48,16 @@ def get_scores(
         return
 
     def mark_issue(
-        self, scores: pd.DataFrame, threshold: float, issue_type: str
+        self,
+        scores: pd.DataFrame,
+        issue_type: str,
+        threshold: Optional[float] = None,
     ) -> pd.DataFrame:
         is_issue = pd.DataFrame(index=scores.index)
-        is_issue[get_is_issue_colname(issue_type)] = (
-            scores[get_score_colname(issue_type)] < threshold
-        )
+        is_issue_colname, score_colname = get_is_issue_colname(
+            issue_type
+        ), get_score_colname(issue_type)
+        is_issue[is_issue_colname] = scores[score_colname] < threshold
         return is_issue
 
 
@@ -294,8 +298,8 @@ def calc_color_space(image: Image) -> str:
 
 
 def calc_image_area_sqrt(image: Image) -> float:
-    size = image.size
-    return math.sqrt(size[0] * size[1])
+    w, h = image.size
+    return math.sqrt(w) * math.sqrt(h)
 
 
 class ColorSpaceProperty(ImageProperty):
@@ -326,12 +330,14 @@ def get_scores(
         return scores
 
     def mark_issue(
-        self, scores: pd.DataFrame, threshold: float, issue_type: str
+        self, scores: pd.DataFrame, issue_type: str, threshold: Optional[float] = None
     ) -> pd.DataFrame:
         is_issue = pd.DataFrame(index=scores.index)
-        is_issue[get_is_issue_colname(issue_type)] = (
-            1 - scores[get_score_colname(issue_type)]
-        ).astype("bool")
+        is_issue_colname, score_colname = get_is_issue_colname(
+            issue_type
+        ), get_score_colname(issue_type)
+
+        is_issue[is_issue_colname] = (1 - scores[score_colname]).astype("bool")
         return is_issue
 
 
@@ -344,6 +350,7 @@ def score_columns(self) -> List[str]:
 
     def __init__(self) -> None:
         self._score_columns = [self.name]
+        self.threshold = 0.5  # todo: this ensures that the scores are evenly distributed across the range
 
     def calculate(self, image: Image) -> Dict[str, Union[float, str]]:
         return {self.name: calc_image_area_sqrt(image)}
@@ -352,35 +359,49 @@ def get_scores(
         self,
         raw_scores: pd.DataFrame,
         issue_type: str,
+        iqr_factor: float = 3.0,
         **kwargs: Any,
     ) -> pd.DataFrame:
         super().get_scores(raw_scores, issue_type, **kwargs)
         assert raw_scores is not None
 
-        image_size_scores = raw_scores[self.score_columns[0]]
-        median_image_size = image_size_scores.median()
-        size_ratios = image_size_scores / median_image_size
-
-        # Computing the values of the two divisions
-        size_division_1 = size_ratios
-        size_division_2 = 1.0 / size_ratios
+        size = raw_scores[self.name]
+        q1, q3 = np.percentile(size, [25, 75])
+        size_iqr = q3 - q1
+        min_threshold, max_threshold = (
+            q1 - iqr_factor * size_iqr,
+            q3 + iqr_factor * size_iqr,
+        )
+        mid_threshold = (min_threshold + max_threshold) / 2
+        threshold_gap = max_threshold - min_threshold
+        distance = np.absolute(size - mid_threshold)
+
+        if threshold_gap > 0:
+            norm_value = threshold_gap
+            self.threshold = 0.5
+        elif threshold_gap == 0:
+            norm_value = mid_threshold
+            self.threshold = 1.0
+        else:
+            raise ValueError("threshold_gap should be non negative")
 
-        # Using np.minimum to determine the element-wise minimum value between the two divisions
-        size_scores = np.minimum(size_division_1, size_division_2)
+        norm_dist = distance / norm_value
+        score_values = 1 - np.clip(norm_dist, 0, 1)
 
         scores = pd.DataFrame(index=raw_scores.index)
-        scores[get_score_colname(issue_type)] = size_scores
+        scores[get_score_colname(issue_type)] = score_values
         return scores
 
     def mark_issue(
-        self, scores: pd.DataFrame, threshold: float, issue_type: str
+        self, scores: pd.DataFrame, issue_type: str, threshold: Optional[float] = None
     ) -> pd.DataFrame:
+        threshold = self.threshold if threshold is None else threshold
+        is_issue_colname, score_colname = get_is_issue_colname(
+            issue_type
+        ), get_score_colname(issue_type)
+
         is_issue = pd.DataFrame(index=scores.index)
-        is_issue[get_is_issue_colname(issue_type)] = np.where(
-            scores[get_score_colname(issue_type)] < 1.0 / threshold,
-            True,
-            False,
-        )
+        is_issue[is_issue_colname] = scores[score_colname] < threshold
         return is_issue
 
 

diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py
@@ -1,30 +1,27 @@
 import multiprocessing
-from typing import Dict, Any, List, Set, Optional, Union
+from typing import Any, Dict, List, Optional, Set, Union
 
 import pandas as pd
 from tqdm.auto import tqdm
 
 from cleanvision.dataset.base_dataset import Dataset
-from cleanvision.issue_managers import register_issue_manager, IssueType
+from cleanvision.issue_managers import IssueType, register_issue_manager
 from cleanvision.issue_managers.image_property import (
-    BrightnessProperty,
     AspectRatioProperty,
-    EntropyProperty,
     BlurrinessProperty,
+    BrightnessProperty,
     ColorSpaceProperty,
+    EntropyProperty,
     ImageProperty,
     SizeProperty,
 )
 from cleanvision.utils.base_issue_manager import IssueManager
 from cleanvision.utils.constants import (
     IMAGE_PROPERTY,
-    MAX_PROCS,
     IMAGE_PROPERTY_ISSUE_TYPES_LIST,
+    MAX_PROCS,
 )
-from cleanvision.utils.utils import (
-    get_is_issue_colname,
-    update_df,
-)
+from cleanvision.utils.utils import get_is_issue_colname, update_df
 
 
 def compute_scores(
@@ -72,7 +69,7 @@ def get_default_params(self) -> Dict[str, Any]:
                 "color_threshold": 0.18,
             },
             IssueType.GRAYSCALE.value: {},
-            IssueType.ODD_SIZE.value: {"threshold": 10.0},
+            IssueType.ODD_SIZE.value: {"iqr_factor": 3.0},
         }
 
     def update_params(self, params: Dict[str, Any]) -> None:
@@ -203,11 +200,15 @@ def update_issues(
             score_columns = agg_computations[score_column_names]
 
             issue_scores = self.image_properties[issue_type].get_scores(
-                score_columns, issue_type, **self.params[issue_type]
+                raw_scores=score_columns,
+                issue_type=issue_type,
+                **self.params[issue_type],
             )
 
             is_issue = self.image_properties[issue_type].mark_issue(
-                issue_scores, self.params[issue_type].get("threshold"), issue_type
+                scores=issue_scores,
+                issue_type=issue_type,
+                threshold=self.params[issue_type].get("threshold"),
             )
             self.issues = self.issues.join(issue_scores)
             self.issues = self.issues.join(is_issue)
@@ -240,23 +241,20 @@ def update_info(self, agg_computations: pd.DataFrame) -> None:
             issue_type: self.image_properties[issue_type].name
             for issue_type in self.issue_types
         }
-        issue_columns = {
-            issue_type: [
-                col
-                for col in agg_computations.columns
-                if col.startswith(property_names[issue_type] + "_")
-            ]
-            for issue_type in self.issue_types
-        }
 
         for issue_type in self.issue_types:
-            self.info["statistics"][property_names[issue_type]] = agg_computations[
-                property_names[issue_type]
+            property_name = property_names[issue_type]
+
+            self.info["statistics"][property_name] = agg_computations[
+                property_name
+            ].describe()
+
+            issue_columns = [
+                col for col in agg_computations.columns if col.startswith(property_name)
             ]
+
             self.info[issue_type] = (
-                agg_computations[issue_columns[issue_type]]
-                if len(issue_columns[issue_type]) > 0
-                else {}
+                agg_computations[issue_columns] if len(issue_columns) > 0 else {}
             )
 
     def update_summary(self) -> None:

diff --git a/tests/test_image_property_helpers.py b/tests/test_image_property_helpers.py
@@ -2,18 +2,18 @@
 import pandas as pd
 import pytest
 from PIL import Image
+from pytest import approx
 
 import cleanvision
-import math
 from cleanvision.issue_managers import IssueType
 from cleanvision.issue_managers.image_property import (
     BrightnessProperty,
-    calculate_brightness,
-    get_image_mode,
     calc_aspect_ratio,
+    calc_blurriness,
     calc_entropy,
     calc_image_area_sqrt,
-    calc_blurriness,
+    calculate_brightness,
+    get_image_mode,
 )
 from cleanvision.utils.utils import get_is_issue_colname, get_score_colname
 
@@ -54,8 +54,8 @@ def test_calc_bluriness():
 
 def test_calc_area():
     img = Image.new("RGB", (200, 200), (255, 0, 0))
-    area = calc_image_area_sqrt(img)  # img.size[0] * img.size[1]
-    assert area == math.sqrt(200 * 200)
+    area = calc_image_area_sqrt(img)
+    assert area == approx(200)
 
 
 @pytest.mark.parametrize(
@@ -137,5 +137,5 @@ def test_get_scores(self, image_property, issue_type, expected_output):
         ],
     )
     def test_mark_issue(self, image_property, scores, threshold, expected_mark):
-        mark = image_property.mark_issue(scores, threshold, "fake_issue")
+        mark = image_property.mark_issue(scores, "fake_issue", threshold)
         assert all(mark == expected_mark)