Skip to content

Commit

Permalink
feat: clip embeddings (#399)
Browse files Browse the repository at this point in the history
* feat: embed with clip

* fix: catch when images can't be loaded properly

* fix: new lock file with dependencies from both jwt and clip

* refactor: base class for embedding models and move embedding files to pfs

* chore: remove unused file

* refactor: metric types

* fix: circular import

* fix: backward compatibility
  • Loading branch information
frederik-encord authored May 16, 2023
1 parent f199e40 commit b91e255
Show file tree
Hide file tree
Showing 46 changed files with 3,750 additions and 3,479 deletions.
3 changes: 2 additions & 1 deletion docs/docs/metrics/write-your-own.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ Your implementation should call `writer.write(<object_score>, <object>)` for eve
from loguru import logger

from encord_active.lib.common.iterator import Iterator
from encord_active.lib.metrics.metric import AnnotationType, DataType, Metric, MetricType
from encord_active.lib.metrics.metric import Metric
from encord_active.lib.metrics.types import AnnotationType, DataType, MetricType
from encord_active.lib.metrics.writer import CSVMetricWriter

logger = logger.opt(colors=True)
Expand Down
4 changes: 2 additions & 2 deletions docs/docs/sdk/initialize-a-project.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ from pathlib import Path
from typing import List

from encord_active.lib.metrics.execute import run_metrics
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.project.local import ProjectExistsError, init_local_project

# 1. Choose images to import
Expand Down Expand Up @@ -88,7 +88,7 @@ from pathlib import Path
from typing import List

from encord_active.lib.metrics.execute import run_metrics
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.project.local import ProjectExistsError, init_local_project

# 1. Choose images and label files to import
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/sdk/run-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ There is a utility function you can use to run targeted subsets of metrics:
from encord_active.lib.metrics.execute import (
run_metrics_by_embedding_type,
)
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
run_metrics_by_embedding_type(EmbeddingType.IMAGE, **options)
run_metrics_by_embedding_type(EmbeddingType.OBJECT, **options)
Expand Down
6 changes: 4 additions & 2 deletions examples/building-a-custom-metric-function.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@
"from typing import List, Optional, Union\n",
"\n",
"from encord_active.lib.common.iterator import Iterator\n",
"from encord_active.lib.metrics.metric import AnnotationType, DataType, MetricType, Metric\n",
"from encord_active.lib.metrics.metric import Metric\n",
"from encord_active.lib.metrics.types import AnnotationType, DataType, MetricType\n",
"from encord_active.lib.metrics.writer import CSVMetricWriter\n",
"\n",
"class ExampleMetric(Metric):\n",
Expand Down Expand Up @@ -442,7 +443,8 @@
"import numpy as np\n",
"from encord_active.lib.common import utils\n",
"from encord_active.lib.common.iterator import Iterator\n",
"from encord_active.lib.metrics.metric import AnnotationType, DataType, Metric, MetricType\n",
"from encord_active.lib.metrics.metric import Metric\n",
"from encord_active.lib.metrics.types import AnnotationType, DataType, MetricType\n",
"from encord_active.lib.metrics.writer import CSVMetricWriter\n",
"from loguru import logger\n",
"\n",
Expand Down
6,419 changes: 3,297 additions & 3,122 deletions poetry.lock

Large diffs are not rendered by default.

59 changes: 35 additions & 24 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,39 @@ name = "encord-active"
version = "v0.1.57"
description = "Enable users to improve machine learning models in an active learning fashion via data, label, and model quality."
authors = ["Cord Technologies Limited <[email protected]>"]
classifiers=[
"Environment :: Console",
"Environment :: Web Environment",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Software Development",
"Topic :: Software Development :: Quality Assurance"
classifiers = [
"Environment :: Console",
"Environment :: Web Environment",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Software Development",
"Topic :: Software Development :: Quality Assurance",
]
documentation = "https://docs.encord.com/active/docs"
homepage = "https://encord.com/encord-active/"
keywords = ["encord", "active", "machine", "learning", "data", "label", "model", "quality", "test"]
keywords = [
"encord",
"active",
"machine",
"learning",
"data",
"label",
"model",
"quality",
"test",
]
readme = "README.md"
repository = "https://github.com/encord-team/encord-active"

include = ['.env']

packages = [
{ include = "encord_active", from = "src" },
]
packages = [{ include = "encord_active", from = "src" }]

license = "Apache-2.0"

Expand All @@ -44,8 +52,6 @@ natsort = "^8.1.0"
pandas = "^1.4.3"
shapely = "^1.7.0"
watchdog = "^2.1.9"
torch = "^1.12.1"
torchvision = "^0.13.1"
faiss-cpu = "^1.7.2"
matplotlib = "^3.5.3"
scikit-learn = "^1.0.1"
Expand All @@ -63,23 +69,28 @@ rich = "^12.6.0"
PyYAML = "^6.0"
toml = "^0.10.2"
pydantic = "^1.10.2"
pycocotools = {version = "^2.0.6", optional = true}
pycocotools = { version = "^2.0.6", optional = true }
psutil = "^5.9.4"
pandera = "^0.13.4"
jupyterlab = {version = "^3.5.2", optional = true}
ipywidgets = {version = "^8.0.4", optional = true}
jupyterlab = { version = "^3.5.2", optional = true }
ipywidgets = { version = "^8.0.4", optional = true }
inquirerpy = "^0.3.4"
statsmodels = "^0.13.5"
umap-learn = "^0.5.3"
streamlit-plotly-events = "^0.0.6"
encord-active-components = "^0.0.12"
llvmlite = "^0.39.1" # Pinning, as lower versions conflict with other libs
llvmlite = "^0.39.1" # Pinning, as lower versions conflict with other libs
gitpython = "^3.1.31"
prisma = "^0.8.2"
fastapi = "^0.95.0"
uvicorn = {extras = ["standard"], version = "^0.21.1"}
uvicorn = { extras = ["standard"], version = "^0.21.1" }
nodejs-bin = "^18.4.0a4"
pyjwt = "^2.7.0"
torch = "^2.0.0"
clip = { git = "https://github.com/openai/CLIP.git" }
torchvision = "^0.15.2"
ftfy = "^6.1.1"
regex = "^2023.5.5"

[tool.poetry.extras]
coco = ["pycocotools"]
Expand Down
2 changes: 1 addition & 1 deletion src/encord_active/app/common/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from encord_active.lib.db.connection import DBConnection
from encord_active.lib.db.merged_metrics import MergedMetrics, initialize_merged_metrics
from encord_active.lib.embeddings.utils import Embedding2DSchema
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.metrics.utils import MetricData, MetricSchema
from encord_active.lib.model_predictions.reader import LabelSchema, OntologyObjectJSON
from encord_active.lib.model_predictions.writer import OntologyClassificationJSON
Expand Down
2 changes: 1 addition & 1 deletion src/encord_active/app/label_onboarding/label_onboarding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
execute_metrics,
get_metrics_by_embedding_type,
)
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.project.project import Project


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from encord_active.lib.charts.scopes import PredictionMatchScope
from encord_active.lib.embeddings.dimensionality_reduction import get_2d_embedding_data
from encord_active.lib.embeddings.utils import Embedding2DSchema
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.metrics.utils import MetricSchema
from encord_active.lib.model_predictions.classification_metrics import (
match_predictions_and_labels,
Expand Down Expand Up @@ -253,7 +253,7 @@ def render_explorer(self):

if EmbeddingType.IMAGE not in get_state().reduced_embeddings:
get_state().reduced_embeddings[EmbeddingType.IMAGE] = get_2d_embedding_data(
get_state().project_paths.embeddings, EmbeddingType.IMAGE
get_state().project_paths, EmbeddingType.IMAGE
)

metric_name = get_state().predictions.metric_datas_classification.selected_prediction
Expand Down
2 changes: 1 addition & 1 deletion src/encord_active/app/projects_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
try_find_parent_project,
)
from encord_active.lib.common.image_utils import show_image_and_draw_polygons
from encord_active.lib.metrics.metric import AnnotationType
from encord_active.lib.metrics.types import AnnotationType
from encord_active.lib.metrics.utils import load_metric_metadata
from encord_active.lib.model_predictions.writer import (
iterate_classification_attribute_options,
Expand Down
2 changes: 1 addition & 1 deletion src/encord_active/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def import_local_project(
run_metrics_by_embedding_type,
)
from encord_active.lib.metrics.heuristic.img_features import AreaMetric
from encord_active.lib.metrics.metric import EmbeddingType
from encord_active.lib.metrics.types import EmbeddingType
from encord_active.lib.project.local import (
NoFilesFoundError,
ProjectExistsError,
Expand Down
2 changes: 1 addition & 1 deletion src/encord_active/lib/db/merged_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from encord_active.lib.db.connection import DBConnection
from encord_active.lib.db.tags import Tag, TagScope
from encord_active.lib.labels.classification import ClassificationType
from encord_active.lib.metrics.metric import DataType, EmbeddingType
from encord_active.lib.metrics.types import DataType, EmbeddingType
from encord_active.lib.metrics.utils import load_metric_metadata
from encord_active.lib.project.project_file_structure import ProjectFileStructure

Expand Down
12 changes: 6 additions & 6 deletions src/encord_active/lib/embeddings/dimensionality_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from pandera.typing import DataFrame

from encord_active.lib.embeddings.utils import (
EMBEDDING_REDUCED_TO_FILENAME,
Embedding2DSchema,
EmbeddingType,
load_collections,
)
from encord_active.lib.project.project_file_structure import ProjectFileStructure

warnings.filterwarnings("ignore", "n_neighbors is larger than the dataset size", category=UserWarning)
MIN_SAMPLES = 4 # Determined experimentally: with fewer samples than this, the UMAP calculation fails with an error
Expand All @@ -23,8 +23,9 @@ def generate_2d_embedding_data(embedding_type: EmbeddingType, project_dir: Path)
"""
Transform high-dimensional embedding data to 2D and save the result to a file.
"""
pfs = ProjectFileStructure(project_dir)

collections = load_collections(embedding_type, project_dir / "embeddings")
collections = load_collections(embedding_type, pfs.embeddings)
if not collections:
return

Expand Down Expand Up @@ -62,15 +63,14 @@ def generate_2d_embedding_data(embedding_type: EmbeddingType, project_dir: Path)
embeddings_2d_collection["x"].append(embeddings_2d[counter, 0])
embeddings_2d_collection["y"].append(embeddings_2d[counter, 1])

target_path = Path(project_dir / "embeddings" / EMBEDDING_REDUCED_TO_FILENAME[embedding_type])
target_path = pfs.get_embeddings_file(embedding_type, reduced=True)
target_path.write_bytes(pickle.dumps(embeddings_2d_collection))


def get_2d_embedding_data(
embeddings_path: Path, embedding_type: EmbeddingType
project_file_structure: ProjectFileStructure, embedding_type: EmbeddingType
) -> Optional[DataFrame[Embedding2DSchema]]:

embedding_file_path = embeddings_path / EMBEDDING_REDUCED_TO_FILENAME[embedding_type]
embedding_file_path = project_file_structure.get_embeddings_file(embedding_type, reduced=True)

if not embedding_file_path.exists():
return None
Expand Down
Loading

0 comments on commit b91e255

Please sign in to comment.