Skip to content

Commit

Permalink
Odd size revamp (#247)
Browse files Browse the repository at this point in the history
* fixed formatting

* updated flake8 check

* Fixed test

* Fixed test

* Removed unnecessary dependencies in dev req file

* Fixed odd size tests

* Fixed mypy error

* Fixed typing syntax

* Updated odd size title key

* Update info['statistics'] with describe stats

* Updated tutorial notebook

* Fixed tests
  • Loading branch information
sanjanag authored Feb 7, 2024
1 parent 4b4932d commit 972f060
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 92 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest pytest-cov psutil -e ".[all]"
pip install -e ".[all]"
pip install -r requirements-dev.txt
shell: bash
- name: Test with coverage
run: pytest --verbose --cov=src/cleanvision/ --cov-config .coveragerc --cov-report=xml tests/
Expand Down
19 changes: 8 additions & 11 deletions docs/source/tutorials/tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Using CleanVision to detect default issue types"
"### 1. Using CleanVision to detect issues in your dataset"
]
},
{
Expand All @@ -124,9 +124,6 @@
"# Initialize imagelab with your dataset\n",
"imagelab = Imagelab(data_path=dataset_path)\n",
"\n",
"# Visualize a few sample images from the dataset\n",
"imagelab.visualize(num_images=8)\n",
"\n",
"# Find issues\n",
"imagelab.find_issues()"
]
Expand All @@ -153,17 +150,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The main way to interface with your data is via the `Imagelab` class. This class can be used to understand the issues in your dataset at a high level (global overview) and low level (issues and quality scores for each image) as well as additional information about the dataset. It has three main attributes:\n",
"The main way to interface with your data is via the [Imagelab](https://cleanvision.readthedocs.io/en/latest/cleanvision/imagelab.html#cleanvision.imagelab.Imagelab) class. This class can be used to understand the issues in your dataset at a high level (global overview) and low level (issues and quality scores for each image) as well as additional information about the dataset. It has three main attributes:\n",
"\n",
"- `Imagelab.issue_summary`\n",
"- `Imagelab.issues`\n",
"- `Imagelab.info`\n",
"\n",
"#### imagelab.issue_summary\n",
"Dataframe with global summary of all issue types detected in your dataset and the overall prevalence of each type.\n",
"This is a Dataframe containing a comprehensive summary of all detected issue types within your dataset, along with their respective prevalence levels. Each row in this summary includes the following information:\n",
"\n",
"In each row:\\\n",
"`issue_type` - name of the issue\\\n",
"`num_images` - number of images of that issue type found in the dataset"
"`issue_type`: The name of the detected issue.\\\n",
"`num_images`: The number of images exhibiting the identified issue within the dataset."
]
},
{
Expand Down Expand Up @@ -301,7 +298,7 @@
"tags": []
},
"source": [
"You can see **entropy** values for each image in the dataset as shown below."
"You can see **size** statistics for the dataset below. Here we observe, both the 25th and 75th percentile are 256 for the dataset, hence images that are further away from this range are detected as oddly sized."
]
},
{
Expand All @@ -310,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"imagelab.info[\"statistics\"][\"entropy\"]"
"imagelab.info[\"statistics\"][\"size\"]"
]
},
{
Expand Down
3 changes: 0 additions & 3 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ mypy
pre-commit
pytest
pytest-cov
pytest-lazy-fixture
datasets>=2.7.0
torchvision>=0.12.0
black
build
flake8
Expand Down
6 changes: 4 additions & 2 deletions src/cleanvision/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
import sys
from typing import Any, Union

from cleanvision.imagelab import Imagelab as _Imagelab

PYTHON_VERSION_INFO = sys.version_info


def get_version() -> str:
def get_version() -> Union[str, Any]:
if sys.version_info.major >= 3 and sys.version_info.minor >= 8:
import importlib.metadata

return importlib.metadata.version("cleanvision")
else:
import importlib_metadata

return importlib_metadata.version("cleanvision") # type:ignore
return importlib_metadata.version("cleanvision")


try:
Expand Down
4 changes: 1 addition & 3 deletions src/cleanvision/imagelab.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,7 @@ def _visualize(
if show_id:
title_info["ids"] = [f"id : {i}" for i in indices]
if issue_type == IssueType.ODD_SIZE.value:
title_info["size"] = [
f"original size: {image.size}" for image in images
]
title_info["size"] = [f"size: {image.size}" for image in images]

if images:
VizManager.individual_images(
Expand Down
77 changes: 49 additions & 28 deletions src/cleanvision/issue_managers/image_property.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import math
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Union, overload
from typing import Any, Dict, List, Optional, Union, overload

import numpy as np
import pandas as pd
from PIL import ImageStat, ImageFilter
from PIL import ImageFilter, ImageStat
from PIL.Image import Image

from cleanvision.issue_managers import IssueType
Expand Down Expand Up @@ -48,12 +48,16 @@ def get_scores(
return

def mark_issue(
self, scores: pd.DataFrame, threshold: float, issue_type: str
self,
scores: pd.DataFrame,
issue_type: str,
threshold: Optional[float] = None,
) -> pd.DataFrame:
is_issue = pd.DataFrame(index=scores.index)
is_issue[get_is_issue_colname(issue_type)] = (
scores[get_score_colname(issue_type)] < threshold
)
is_issue_colname, score_colname = get_is_issue_colname(
issue_type
), get_score_colname(issue_type)
is_issue[is_issue_colname] = scores[score_colname] < threshold
return is_issue


Expand Down Expand Up @@ -294,8 +298,8 @@ def calc_color_space(image: Image) -> str:


def calc_image_area_sqrt(image: Image) -> float:
size = image.size
return math.sqrt(size[0] * size[1])
w, h = image.size
return math.sqrt(w) * math.sqrt(h)


class ColorSpaceProperty(ImageProperty):
Expand Down Expand Up @@ -326,12 +330,14 @@ def get_scores(
return scores

def mark_issue(
self, scores: pd.DataFrame, threshold: float, issue_type: str
self, scores: pd.DataFrame, issue_type: str, threshold: Optional[float] = None
) -> pd.DataFrame:
is_issue = pd.DataFrame(index=scores.index)
is_issue[get_is_issue_colname(issue_type)] = (
1 - scores[get_score_colname(issue_type)]
).astype("bool")
is_issue_colname, score_colname = get_is_issue_colname(
issue_type
), get_score_colname(issue_type)

is_issue[is_issue_colname] = (1 - scores[score_colname]).astype("bool")
return is_issue


Expand All @@ -344,6 +350,7 @@ def score_columns(self) -> List[str]:

def __init__(self) -> None:
self._score_columns = [self.name]
self.threshold = 0.5 # todo: this ensures that the scores are evenly distributed across the range

def calculate(self, image: Image) -> Dict[str, Union[float, str]]:
return {self.name: calc_image_area_sqrt(image)}
Expand All @@ -352,35 +359,49 @@ def get_scores(
self,
raw_scores: pd.DataFrame,
issue_type: str,
iqr_factor: float = 3.0,
**kwargs: Any,
) -> pd.DataFrame:
super().get_scores(raw_scores, issue_type, **kwargs)
assert raw_scores is not None

image_size_scores = raw_scores[self.score_columns[0]]
median_image_size = image_size_scores.median()
size_ratios = image_size_scores / median_image_size

# Computing the values of the two divisions
size_division_1 = size_ratios
size_division_2 = 1.0 / size_ratios
size = raw_scores[self.name]
q1, q3 = np.percentile(size, [25, 75])
size_iqr = q3 - q1
min_threshold, max_threshold = (
q1 - iqr_factor * size_iqr,
q3 + iqr_factor * size_iqr,
)
mid_threshold = (min_threshold + max_threshold) / 2
threshold_gap = max_threshold - min_threshold
distance = np.absolute(size - mid_threshold)

if threshold_gap > 0:
norm_value = threshold_gap
self.threshold = 0.5
elif threshold_gap == 0:
norm_value = mid_threshold
self.threshold = 1.0
else:
raise ValueError("threshold_gap should be non negative")

# Using np.minimum to determine the element-wise minimum value between the two divisions
size_scores = np.minimum(size_division_1, size_division_2)
norm_dist = distance / norm_value
score_values = 1 - np.clip(norm_dist, 0, 1)

scores = pd.DataFrame(index=raw_scores.index)
scores[get_score_colname(issue_type)] = size_scores
scores[get_score_colname(issue_type)] = score_values
return scores

def mark_issue(
self, scores: pd.DataFrame, threshold: float, issue_type: str
self, scores: pd.DataFrame, issue_type: str, threshold: Optional[float] = None
) -> pd.DataFrame:
threshold = self.threshold if threshold is None else threshold
is_issue_colname, score_colname = get_is_issue_colname(
issue_type
), get_score_colname(issue_type)

is_issue = pd.DataFrame(index=scores.index)
is_issue[get_is_issue_colname(issue_type)] = np.where(
scores[get_score_colname(issue_type)] < 1.0 / threshold,
True,
False,
)
is_issue[is_issue_colname] = scores[score_colname] < threshold
return is_issue


Expand Down
48 changes: 23 additions & 25 deletions src/cleanvision/issue_managers/image_property_issue_manager.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,27 @@
import multiprocessing
from typing import Dict, Any, List, Set, Optional, Union
from typing import Any, Dict, List, Optional, Set, Union

import pandas as pd
from tqdm.auto import tqdm

from cleanvision.dataset.base_dataset import Dataset
from cleanvision.issue_managers import register_issue_manager, IssueType
from cleanvision.issue_managers import IssueType, register_issue_manager
from cleanvision.issue_managers.image_property import (
BrightnessProperty,
AspectRatioProperty,
EntropyProperty,
BlurrinessProperty,
BrightnessProperty,
ColorSpaceProperty,
EntropyProperty,
ImageProperty,
SizeProperty,
)
from cleanvision.utils.base_issue_manager import IssueManager
from cleanvision.utils.constants import (
IMAGE_PROPERTY,
MAX_PROCS,
IMAGE_PROPERTY_ISSUE_TYPES_LIST,
MAX_PROCS,
)
from cleanvision.utils.utils import (
get_is_issue_colname,
update_df,
)
from cleanvision.utils.utils import get_is_issue_colname, update_df


def compute_scores(
Expand Down Expand Up @@ -72,7 +69,7 @@ def get_default_params(self) -> Dict[str, Any]:
"color_threshold": 0.18,
},
IssueType.GRAYSCALE.value: {},
IssueType.ODD_SIZE.value: {"threshold": 10.0},
IssueType.ODD_SIZE.value: {"iqr_factor": 3.0},
}

def update_params(self, params: Dict[str, Any]) -> None:
Expand Down Expand Up @@ -203,11 +200,15 @@ def update_issues(
score_columns = agg_computations[score_column_names]

issue_scores = self.image_properties[issue_type].get_scores(
score_columns, issue_type, **self.params[issue_type]
raw_scores=score_columns,
issue_type=issue_type,
**self.params[issue_type],
)

is_issue = self.image_properties[issue_type].mark_issue(
issue_scores, self.params[issue_type].get("threshold"), issue_type
scores=issue_scores,
issue_type=issue_type,
threshold=self.params[issue_type].get("threshold"),
)
self.issues = self.issues.join(issue_scores)
self.issues = self.issues.join(is_issue)
Expand Down Expand Up @@ -240,23 +241,20 @@ def update_info(self, agg_computations: pd.DataFrame) -> None:
issue_type: self.image_properties[issue_type].name
for issue_type in self.issue_types
}
issue_columns = {
issue_type: [
col
for col in agg_computations.columns
if col.startswith(property_names[issue_type] + "_")
]
for issue_type in self.issue_types
}

for issue_type in self.issue_types:
self.info["statistics"][property_names[issue_type]] = agg_computations[
property_names[issue_type]
property_name = property_names[issue_type]

self.info["statistics"][property_name] = agg_computations[
property_name
].describe()

issue_columns = [
col for col in agg_computations.columns if col.startswith(property_name)
]

self.info[issue_type] = (
agg_computations[issue_columns[issue_type]]
if len(issue_columns[issue_type]) > 0
else {}
agg_computations[issue_columns] if len(issue_columns) > 0 else {}
)

def update_summary(self) -> None:
Expand Down
14 changes: 7 additions & 7 deletions tests/test_image_property_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import pandas as pd
import pytest
from PIL import Image
from pytest import approx

import cleanvision
import math
from cleanvision.issue_managers import IssueType
from cleanvision.issue_managers.image_property import (
BrightnessProperty,
calculate_brightness,
get_image_mode,
calc_aspect_ratio,
calc_blurriness,
calc_entropy,
calc_image_area_sqrt,
calc_blurriness,
calculate_brightness,
get_image_mode,
)
from cleanvision.utils.utils import get_is_issue_colname, get_score_colname

Expand Down Expand Up @@ -54,8 +54,8 @@ def test_calc_bluriness():

def test_calc_area():
img = Image.new("RGB", (200, 200), (255, 0, 0))
area = calc_image_area_sqrt(img) # img.size[0] * img.size[1]
assert area == math.sqrt(200 * 200)
area = calc_image_area_sqrt(img)
assert area == approx(200)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -137,5 +137,5 @@ def test_get_scores(self, image_property, issue_type, expected_output):
],
)
def test_mark_issue(self, image_property, scores, threshold, expected_mark):
mark = image_property.mark_issue(scores, threshold, "fake_issue")
mark = image_property.mark_issue(scores, "fake_issue", threshold)
assert all(mark == expected_mark)
Loading

0 comments on commit 972f060

Please sign in to comment.