diff --git a/poetry.lock b/poetry.lock index 746396745..3bf4a7041 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aiofiles" @@ -828,17 +828,18 @@ files = [ [[package]] name = "encord" -version = "0.1.83" +version = "0.1.98" description = "Encord Python SDK Client" optional = false -python-versions = ">=3.7,<4.0" +python-versions = ">=3.8,<4.0" files = [ - {file = "encord-0.1.83-py3-none-any.whl", hash = "sha256:c3116c75f60f413ec552024a18755524cc4c7d5b6b774fce883c680c0d747316"}, - {file = "encord-0.1.83.tar.gz", hash = "sha256:7a923c47ca21cf3980ef618ff049383d61ecc82641671fabececc9ba0a32e46c"}, + {file = "encord-0.1.98-py3-none-any.whl", hash = "sha256:e9b1758c8ad4803e82af61bc6644ab60fad52649afea5eea18f51a04ea1f8f95"}, + {file = "encord-0.1.98.tar.gz", hash = "sha256:f2b8447daa270000ff3e401eda6acf427b13b37471bd782c4b87ad4f7d5de60f"}, ] [package.dependencies] cryptography = ">=3.4.8" +pydantic = ">=1.7.0" python-dateutil = ">=2.8.2,<3.0.0" requests = ">=2.25.0,<3.0.0" tqdm = ">=4.32.1,<5.0.0" @@ -969,7 +970,6 @@ files = [ {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"}, {file = "greenlet-2.0.2-cp27-cp27m-win_amd64.whl", hash = "sha256:283737e0da3f08bd637b5ad058507e578dd462db259f7f6e4c5c365ba4ee9343"}, {file = "greenlet-2.0.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d27ec7509b9c18b6d73f2f5ede2622441de812e7b1a80bbd446cb0633bd3d5ae"}, - {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d967650d3f56af314b72df7089d96cda1083a7fc2da05b375d2bc48c82ab3f3c"}, {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:30bcf80dda7f15ac77ba5af2b961bdd9dbc77fd4ac6105cee85b0d0a5fcf74df"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26fbfce90728d82bc9e6c38ea4d038cba20b7faf8a0ca53a9c07b67318d46088"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9190f09060ea4debddd24665d6804b995a9c122ef5917ab26e1566dcc712ceeb"}, @@ -978,7 +978,6 @@ files = [ {file = "greenlet-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:76ae285c8104046b3a7f06b42f29c7b73f77683df18c49ab5af7983994c2dd91"}, {file = "greenlet-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:2d4686f195e32d36b4d7cf2d166857dbd0ee9f3d20ae349b6bf8afc8485b3645"}, {file = "greenlet-2.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c4302695ad8027363e96311df24ee28978162cdcdd2006476c43970b384a244c"}, - {file = "greenlet-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d4606a527e30548153be1a9f155f4e283d109ffba663a15856089fb55f933e47"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c48f54ef8e05f04d6eff74b8233f6063cb1ed960243eacc474ee73a2ea8573ca"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1846f1b999e78e13837c93c778dcfc3365902cfb8d1bdb7dd73ead37059f0d0"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a06ad5312349fec0ab944664b01d26f8d1f05009566339ac6f63f56589bc1a2"}, @@ -1008,7 +1007,6 @@ files = [ {file = "greenlet-2.0.2-cp37-cp37m-win32.whl", hash = "sha256:3f6ea9bd35eb450837a3d80e77b517ea5bc56b4647f5502cd28de13675ee12f7"}, {file = "greenlet-2.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:7492e2b7bd7c9b9916388d9df23fa49d9b88ac0640db0a5b4ecc2b653bf451e3"}, {file = "greenlet-2.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b864ba53912b6c3ab6bcb2beb19f19edd01a6bfcbdfe1f37ddd1778abfe75a30"}, - {file = "greenlet-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1087300cf9700bbf455b1b97e24db18f2f77b55302a68272c56209d5587c12d1"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ba2956617f1c42598a308a84c6cf021a90ff3862eddafd20c3333d50f0edb45b"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3a569657468b6f3fb60587e48356fe512c1754ca05a564f11366ac9e306526"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8eab883b3b2a38cc1e050819ef06a7e6344d4a990d24d45bc6f2cf959045a45b"}, @@ -1017,7 +1015,6 @@ files = [ {file = "greenlet-2.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0ef99cdbe2b682b9ccbb964743a6aca37905fda5e0452e5ee239b1654d37f2a"}, {file = "greenlet-2.0.2-cp38-cp38-win32.whl", hash = "sha256:b80f600eddddce72320dbbc8e3784d16bd3fb7b517e82476d8da921f27d4b249"}, {file = "greenlet-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:4d2e11331fc0c02b6e84b0d28ece3a36e0548ee1a1ce9ddde03752d9b79bba40"}, - {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8512a0c38cfd4e66a858ddd1b17705587900dd760c6003998e9472b77b56d417"}, {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:88d9ab96491d38a5ab7c56dd7a3cc37d83336ecc564e4e8816dbed12e5aaefc8"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:561091a7be172ab497a3527602d467e2b3fbe75f9e783d8b8ce403fa414f71a6"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:971ce5e14dc5e73715755d0ca2975ac88cfdaefcaab078a284fea6cfabf866df"}, @@ -2548,10 +2545,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.2", markers = "python_version >= \"3.10\" or python_version >= \"3.6\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, - {version = ">=1.19.3", markers = "python_version >= \"3.6\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" or python_version >= \"3.9\""}, - {version = ">=1.14.5", markers = "python_version >= \"3.7\""}, - {version = ">=1.17.3", markers = "python_version >= \"3.8\""}, + {version = ">=1.21.2", markers = "python_version >= \"3.10\" or python_version >= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, + {version = ">=1.19.3", markers = "python_version >= \"3.8\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version < \"3.10\" or python_version >= \"3.9\" and python_version < \"3.10\" and platform_system != \"Darwin\" or python_version >= \"3.9\" and python_version < \"3.10\" and platform_machine != \"arm64\""}, ] [[package]] @@ -2659,7 +2654,7 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.1" @@ -4066,7 +4061,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")"} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -5155,4 +5150,4 @@ notebooks = ["ipywidgets", "jupyterlab"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.9.7 || >3.9.7,<3.12" -content-hash = "b2e962857f73a5f3cfaac8dc8d2889626c0b673cdb377ebeef39405950f9b4db" +content-hash = "45f69595f91b056dd4192b35fe160e6b93e7e47abba904d7a08ee3e081f05eca" diff --git a/pyproject.toml b/pyproject.toml index 8787d3c89..e8ba1fad9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ encord-active = "encord_active.cli.main:cli" [tool.poetry.dependencies] python = ">=3.9,<3.9.7 || >3.9.7,<3.12" -encord = "^0.1.83" +encord = "^0.1.95" numpy = ">=1.23.5,<1.24.0" opencv-python = "4.5.5.64" natsort = "^8.1.0" diff --git a/src/encord_active/public/dataset.py b/src/encord_active/public/dataset.py new file mode 100644 index 000000000..3311802d5 --- /dev/null +++ b/src/encord_active/public/dataset.py @@ -0,0 +1,384 @@ +from pathlib import Path +from typing import Optional, Union +from uuid import UUID + +from encord.objects import Object, OntologyStructure, RadioAttribute +from PIL import Image +from sqlalchemy.sql.operators import in_op +from sqlmodel import Session, select +from torch.utils.data import Dataset + +from encord_active.db.models import ( + Project, + ProjectDataMetadata, + ProjectDataUnitMetadata, + ProjectTag, + ProjectTaggedDataUnit, + get_engine, +) +from encord_active.lib.common.data_utils import url_to_file_path + +P = Project +T = ProjectTaggedDataUnit +D = ProjectDataUnitMetadata +L = ProjectDataMetadata + + +class ActiveDataset(Dataset): + def __init__( + self, + database_path: Path, + project_hash: Union[set[str], str], + tag_name: Optional[Union[set[str], str]] = None, + ontology_hashes: Optional[list[str]] = None, + transform=None, + target_transform=None, + ): + database_path = database_path.expanduser().resolve() + if not database_path.is_file(): + raise FileNotFoundError(f"The database file does not exist at the specified path: '{database_path}'") + self.root_path = database_path.parent + self.engine = get_engine(database_path, use_alembic=False) + self.project_hash = {UUID(project_hash)} if isinstance(project_hash, str) else set(map(UUID, project_hash)) + self.tag_name = {tag_name} if isinstance(tag_name, str) else tag_name + self.ontology_hashes = ontology_hashes + + self.identifiers: list[tuple[UUID, int]] = [] + + self.setup() + + self.transform = transform + self.target_transform = target_transform + + def get_identifier_query(self, sess: Session): + identifier_query = select(D.du_hash, D.frame) + if self.tag_name is not None: + in_uids = set() + in_names = set() + for name in self.tag_name: + try: + in_uids.add(UUID(name)) + except ValueError: + in_names.add(name) + + where_clauses = [ + in_op(col, vals) for col, vals in [(ProjectTag.tag_hash, in_uids), (ProjectTag.name, in_names)] if vals + ] + tag_query = select(ProjectTag.tag_hash).where( + in_op(ProjectTag.project_hash, self.project_hash), *where_clauses + ) + tag_hash = set(sess.exec(tag_query).all()) + + if tag_hash is None: + valid_tag_names = sess.exec( + select(ProjectTag.name).where(in_op(ProjectTag.project_hash, self.project_hash)) + ).all() + raise ValueError( + f"Couldn't find a data tag with either name or tag_hash `{self.tag_name}`. Valid tags for the specified project(s) are {valid_tag_names}." + ) + + identifier_query = identifier_query.join( + T, onclause=((T.du_hash == D.du_hash) & (T.frame == D.frame)) + ).where( + in_op(D.project_hash, self.project_hash), + in_op(T.project_hash, self.project_hash), + in_op(T.tag_hash, tag_hash), + ) + else: + identifier_query = identifier_query.where(in_op(D.project_hash, self.project_hash)) + + return identifier_query + + def setup(self): + with Session(self.engine) as sess: + # Check that data is available locally + identifier_query = self.get_identifier_query(sess) + probe = sess.exec(identifier_query.add_columns(D.data_uri).limit(1)).first() + if ( + probe is not None + and probe[-1] is not None + and url_to_file_path(probe[-1], self.root_path) is None # type: ignore + ): + raise ValueError("Couldn't find data locally. Please execute `encord-active download-data` first.") + + # Check for videos + video_probe = sess.exec(identifier_query.where(D.data_uri_is_video).limit(1)).first() + if video_probe is not None: + raise ValueError("Dataset contains videos. This is currently not supported for this dataloader.") + + # Load and validate ontology + if len(self.project_hash) > 1: + ontologies = list(map(OntologyStructure.from_dict, sess.exec(select(P.project_ontology)).all())) # type: ignore + first, *rest = [ + tuple( + [o.feature_node_hash for o in ont.objects] + [c.feature_node_hash for c in ont.classifications] + ) + for ont in ontologies + ] + assert all( + [first == next_ for next_ in rest] + ), "Ontologies must match if you select multiple projects at once" + + ontology_dict = sess.exec( + select(P.project_ontology).where(in_op(P.project_hash, self.project_hash)).limit(1) + ).first() + if ontology_dict is None: + raise ValueError("Couldn't read project ontology") + self.ontology = OntologyStructure.from_dict(ontology_dict) # type: ignore + + def __len__(self): + with Session(self.engine) as sess: + return sess.query(self.get_identifier_query(sess)).count() + + def __getitem__(self, idx): + ... + + +class ActiveClassificationDataset(ActiveDataset): + def __init__( + self, + database_path: Path, + project_hash: Union[set[str], str], + tag_name: Optional[Union[set[str], str]] = None, + ontology_hashes: Optional[list[str]] = None, + transform=None, + target_transform=None, + ): + """ + A dataset hooked up to an Encord Active database. + The dataset can filter (image) data from Encord Active based on both `project_hash`es + and `tag_hash`/`tag_name`s. For example, if you have added a tag called + "train" within Encord Active, you can use that tag name by setting + `tag_name="train"` option. You can also combine multiple tags into one + dataset by providing a set of names. Similarly, you can use multiple + projects if they share the same ontology. Just provide the relevant + project hashes. + + Note: that this dataset requires that you have downloaded the data locally. + This can be done with `encord-active project download-data`. + + ⚠️ Videos are not yet supported. + + Args: + database_path: Path to where the `encord_active.sqlite` database lives + on your system. + project_hash: The project hash (or set of hashes) of the project(s) + to load data from. + tag_name: tag names (or hashes) for the tags you want to include. + If no tags are specified, all images with labels from the project + will be included. + ontology_hashes: The `feature_node_hash` of the radio button classification + question used for labels. The first radiobutton within that + classification will be used. + transform (): Data transform applied to PIL images + target_transform (): Target transform applied to the uint label tensor. + """ + assert ( + ontology_hashes is None or len(ontology_hashes) == 1 + ), "Either don't define ontology hashes to use first radio button in ontology or specify the feature node hash of the classification you want." + + super().__init__( + database_path, + project_hash, + tag_name, + ontology_hashes, + transform, + target_transform, + ) + + def setup(self): + super().setup() + + with Session(self.engine) as sess: + identifier_query = self.get_identifier_query(sess) + identifier_query = identifier_query.join(L, onclause=(L.data_hash == D.data_hash)).where( + in_op(L.project_hash, self.project_hash) + ) + identifier_query = identifier_query.add_columns(D.data_uri, D.classifications, L.label_row_json) + + identifiers = sess.exec(identifier_query).all() + ontology_pairs = [ + (c, a) + for c in self.ontology.classifications + for a in c.attributes + if isinstance(a, RadioAttribute) + and ((not self.ontology_hashes) or c.feature_node_hash in self.ontology_hashes) + ] + if len(ontology_pairs) == 0: + raise ValueError("No ontology classifications were found to use for labels") + classification, attribute = ontology_pairs[0] + indices = {o.feature_node_hash: i for i, o in enumerate(attribute.options)} + self.class_names = [o.title for o in attribute.options] + + self.uris = [] + self.labels = [] + for ( # type: ignore + *_, + data_uri, + classifications, + label_row_json, + ) in identifiers: + classification_answers = label_row_json["classification_answers"] + + clf_instance = next( + (c for c in classifications if c["featureHash"] == classification.feature_node_hash), + None, + ) + if clf_instance is None: + continue + clf_hash = clf_instance["classificationHash"] + clf_classifications = classification_answers[clf_hash]["classifications"] + clf_answers = next( + (a for a in clf_classifications if a["featureHash"] == attribute.feature_node_hash), + None, + ) + if clf_answers is None: + continue + clf_opt = next( + (o for o in clf_answers["answers"] if o["featureHash"] in indices), + None, + ) + if clf_opt is None: + continue + + self.uris.append(url_to_file_path(data_uri, self.root_path)) # type: ignore + self.labels.append(indices[clf_opt["featureHash"]]) + + def __getitem__(self, idx): + data_uri = self.uris[idx] + + img = Image.open(data_uri) + label = self.labels[idx] + + if self.transform: + img = self.transform(img) + + if self.target_transform: + label = self.target_transform(label) + + return img, label + + def __len__(self): + return len(self.labels) + + +class ActiveObjectDataset(ActiveDataset): + def __init__( + self, + database_path: Path, + project_hash: Union[set[str], str], + tag_name: Optional[Union[set[str], str]] = None, + ontology_hashes: Optional[list[str]] = None, + transform=None, + target_transform=None, + ): + """ + A dataset hooked up to an Encord Active database. + The dataset can filter (image) data from Encord Active based on both `project_hash`es + and `tag_hash`/`tag_name`s. For example, if you have added a tag called + "train" within Encord Active, you can use that tag name by setting + `tag_name="train"` option. You can also combine multiple tags into one + dataset by providing a set of names. Similarly, you can use multiple + projects if they share the same ontology. Just provide the relevant + project hashes. + + Note: This dataset requires that you have downloaded the data locally. + This can be done with `encord-active project download-data`. + + ⚠️ Videos are not yet supported. + + Args: + database_path: Path to where the `encord_active.sqlite` database lives + on your system. + project_hash: The project hash (or set of hashes) of the project(s) + to load data from. + tag_name: tag names (or hashes) for the tags you want to include. + If no tags are specified, all images with labels from the project + will be included. + ontology_hashes: The `feature_node_hash` of the objects used for labels. + transform (): Data transform applied to PIL images + target_transform (): Target transform applied to the uint label tensor. # TODO update the type + """ + super().__init__( + database_path, + project_hash, + tag_name, + ontology_hashes, + transform, + target_transform, + ) + + def setup(self): + super().setup() + + with Session(self.engine) as sess: + identifier_query = self.get_identifier_query(sess) + identifier_query = identifier_query.join(L, onclause=(L.data_hash == D.data_hash)).where( + in_op(L.project_hash, self.project_hash) + ) + identifier_query = identifier_query.add_columns(D.data_uri, D.objects, L.label_row_json) + identifiers = sess.exec(identifier_query).all() + + feature_hash_to_ontology_object: dict[str, Object] = { + o.feature_node_hash: o + for o in self.ontology.objects + if (self.ontology_hashes is None or (o.feature_node_hash in self.ontology_hashes)) + } + + if self.ontology_hashes is not None and len(feature_hash_to_ontology_object) != len(self.ontology_hashes): + missing_feature_hashes = set(self.ontology_hashes).difference(set(feature_hash_to_ontology_object)) + error_log = [] + for feature_hash in missing_feature_hashes: + ontology_object = feature_hash_to_ontology_object.get(feature_hash) + shape = "UNKNOWN" if ontology_object is None else ontology_object.shape.name + error_log.append(f"{(ontology_object and ontology_object.title) or feature_hash}: {shape}") + raise ValueError( + "Mismatch between objects with specified `ontology_hashes` and supported shapes: " + + ", ".join(error_log) + ) + if len(feature_hash_to_ontology_object) == 0: + raise ValueError("No ontology objects with supported shapes were found to use for labels") + + self.class_names = [o.title for o in feature_hash_to_ontology_object.values()] + + self.data_unit_paths: list[Path] = [] + self.labels_per_data_unit: list[list[dict]] = [] + self.label_attributes_per_data_unit: list[dict] = [] + for ( # type: ignore + *_, + data_uri, + all_objects, + label_row_json, + ) in identifiers: + object_hash_to_object = { + o["objectHash"]: o for o in all_objects if o["featureHash"] in feature_hash_to_ontology_object + } + object_attributes = { + k: v for k, v in label_row_json["object_answers"].items() if k in object_hash_to_object + } + + data_unit_path = url_to_file_path(data_uri, self.root_path) + if data_unit_path is None: + # Skip file as it's missing + continue + + self.data_unit_paths.append(data_unit_path) # TODO check "type: ignore" + self.labels_per_data_unit.append(list(object_hash_to_object.values())) + self.label_attributes_per_data_unit.append(object_attributes) + + def __getitem__(self, idx): + data_unit_path = self.data_unit_paths[idx] + + img = Image.open(data_unit_path) + labels = self.labels_per_data_unit[idx] + + if self.transform: + img = self.transform(img) + + if self.target_transform: + labels = self.target_transform(labels) + + return img, labels + + def __len__(self): + return len(self.data_unit_paths)