Skip to content

Commit

Permalink
Merge pull request #25 from DavidStirling/remote-tables
Browse files Browse the repository at this point in the history
Add initial remote table registration implementation
  • Loading branch information
sbesson authored Jan 31, 2025
2 parents 5243bed + 9399e67 commit 3f15348
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 12 deletions.
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,43 @@ These should match the relevant column type. Mapped variables are substituted in

A `variables` map usually isn't needed for simple queries. The basic condition string should automatically get converted to a meaningful type, but when this fails,
replacing tricky elements with a variable may help.

### Remote registration [Experimental]

For **OMERO Plus** installations which support TileDB as the OMERO.tables backend
it is possible to register tables in-place in a similar manner to in-place image
imports (otherwise table data is stored in the ManagedRepository).

If you don't know what table backend your OMERO Plus server is using, you
probably don't have this feature available. If you have access to the server
machine, you can check by running `omero config get omero.tables.module`.
If the response is `omero_plus.run_tables_pytables_or_tiledb`, then TileDB is
available.

This feature is currently in active development. The current version of
omero2pandas can export tables locally in TileDB format to be registered with
OMERO using external tooling.


For this mode to be available, extra dependencies must also be installed as follows:

```bash
pip install omero2pandas[remote]
```

To activate this mode use `omero2pandas.upload_table` with arguments as
follows:

```python
import omero2pandas
db_path = omero2pandas.upload_table("/path/to/my_data.csv", "Name for table",
local_path="/path/to/mytable.tiledb")
# Returns the path to the created tiledb file
```

Similar to regular table uploads, the input can be a dataframe in memory or a
csv file on disk.

A `remote_path` argument is also available. In future versions this will be
used if the remote table path is different from the server's point of view (e.g.
network drives are mapped at another location).
23 changes: 20 additions & 3 deletions omero2pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# If the file is missing please request a copy by contacting
# [email protected].
import collections
from importlib.util import find_spec
import logging
import os
import sys
Expand All @@ -19,6 +20,10 @@

from omero2pandas.connect import OMEROConnection
from omero2pandas.upload import create_table
if find_spec("tiledb"):
from omero2pandas.remote import register_table
else:
register_table = None

logging.basicConfig(
format="%(asctime)s %(levelname)-7s [%(name)16s] %(message)s",
Expand Down Expand Up @@ -185,7 +190,8 @@ def read_table(file_id=None, annotation_id=None, column_names=(), rows=None,

def upload_table(source, table_name, parent_id=None, parent_type='Image',
links=None, chunk_size=None, omero_connector=None,
server=None, port=4064, username=None, password=None):
server=None, port=4064, username=None, password=None,
local_path=None, remote_path=None):
"""
Upload a pandas dataframe to a new OMERO table.
For the connection, supply either an active client object or server
Expand All @@ -205,6 +211,10 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
:param server: Address of the server
:param port: Port the server runs on (default 4064)
:param username: Username for server login
:param local_path: [TileDB only], construct table at this file path and
register remotely
:param remote_path: [TileDB only], mapping for local_path on the server
(if different from local system)
:param password: Password for server login
:return: File Annotation ID of the new table
"""
Expand All @@ -220,7 +230,7 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
if parent_id is not None:
if (parent_type, parent_id) not in links:
links.append((parent_type, parent_id))
if not links:
if not links and not local_path:
raise ValueError("No OMERO objects to link the table to")
elif not isinstance(links, Iterable):
raise ValueError(f"Links should be an iterable list of "
Expand All @@ -229,7 +239,14 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
port=port, client=omero_connector) as connector:
conn = connector.get_gateway()
conn.SERVICE_OPTS.setOmeroGroup('-1')
ann_id = create_table(source, table_name, links, conn, chunk_size)
if local_path or remote_path:
if not register_table:
raise ValueError("Remote table support is not installed")
ann_id = register_table(source, local_path,
remote_path=remote_path,
chunk_size=chunk_size)
else:
ann_id = create_table(source, table_name, links, conn, chunk_size)
if ann_id is None:
LOGGER.warning("Failed to create OMERO table")
return ann_id
Expand Down
65 changes: 65 additions & 0 deletions omero2pandas/remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# encoding: utf-8
#
# Copyright (c) Glencoe Software, Inc. All rights reserved.
#
# This software is distributed under the terms described by the LICENCE file
# you can find at the root of the distribution bundle.
# If the file is missing please request a copy by contacting
# [email protected].
import logging
from pathlib import Path, PurePosixPath
import time

import pandas as pd
import tiledb
from tqdm.auto import tqdm

LOGGER = logging.getLogger(__name__)

OMERO_TILEDB_VERSION = '3' # Version of the omero table implementation


def register_table(source, local_path, remote_path=None, chunk_size=1000):
    """
    Write a table to a TileDB array on disk and tag it with OMERO metadata.

    The data is written in chunks to ``local_path`` (falling back to
    ``remote_path`` when no local path is given), with the file suffix
    forced to ``.tiledb``.

    :param source: pandas DataFrame, or a str/Path pointing to a CSV file
    :param local_path: Path at which to construct the TileDB array
    :param remote_path: Path of the array from the server's point of view.
        Currently this is only resolved and logged.
    :param chunk_size: Rows to write per batch. Falsy values fall back
        to 1000.
    :return: Path of the TileDB array that was written, as a string
    :raises ValueError: If no path was supplied, the target already exists
        or the source produced no data to write
    """
    if not (local_path or remote_path):
        # Fail with a clear message rather than a TypeError from Path(None)
        raise ValueError("A local_path or remote_path must be provided")
    LOGGER.info("Registering remote table")
    write_path = Path(local_path or remote_path).with_suffix(".tiledb")
    # Assume the server will be running on Linux
    remote_path = PurePosixPath(
        remote_path or local_path).with_suffix(".tiledb")
    LOGGER.debug("Remote path would be %s", remote_path)
    if write_path.exists():
        raise ValueError(f"Table file {write_path} already exists")
    # path.as_uri() exists but mangles any spaces in the path!
    write_path = str(write_path)
    # Use a default chunk size if not set
    chunk_size = chunk_size or 1000
    LOGGER.info("Writing data to TileDB")
    # Export table
    if isinstance(source, (str, Path)):
        data_iterator = pd.read_csv(source, chunksize=chunk_size)
        total_rows = None
    else:
        data_iterator = (source.iloc[i:i + chunk_size]
                         for i in range(0, len(source), chunk_size))
        total_rows = len(source)
    row_idx = 0
    chunks_written = 0
    try:
        # The context manager closes the progress bar even if a write fails
        with tqdm(
                desc="Generating TileDB file...", initial=1,
                dynamic_ncols=True, total=total_rows,
                bar_format='{desc}: {percentage:3.0f}%|{bar}| '
                           '{n_fmt}/{total_fmt} rows, {elapsed} {postfix}'
        ) as progress_monitor:
            for chunk in data_iterator:
                # attr_filters=None keeps the default filters from
                # tiledb.from_pandas(). The first write creates the array,
                # later writes append to it.
                tiledb.from_pandas(
                    write_path, chunk, sparse=True, full_domain=True,
                    tile=10000, attr_filters=None,
                    row_start_idx=row_idx, allows_duplicates=False,
                    mode="append" if row_idx else "ingest")
                progress_monitor.update(len(chunk))
                row_idx += len(chunk)
                chunks_written += 1
    finally:
        # Both the pandas chunk reader and the generator expose close();
        # for the CSV case this releases the underlying file handle.
        data_iterator.close()
    if not chunks_written:
        # Nothing was ingested, so no array exists to attach metadata to
        raise ValueError("Source contained no data to write")
    LOGGER.debug("Appending metadata to TileDB")
    # Append omero metadata
    with tiledb.open(write_path, mode="w") as array:
        array.meta['__version'] = OMERO_TILEDB_VERSION
        array.meta['__initialized'] = time.time()
    LOGGER.info("Table saved successfully")
    return write_path
5 changes: 3 additions & 2 deletions omero2pandas/upload.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# encoding: utf-8
#
# Copyright (c) 2023 Glencoe Software, Inc. All rights reserved.
# Copyright (c) Glencoe Software, Inc. All rights reserved.
#
# This software is distributed under the terms described by the LICENCE file
# you can find at the root of the distribution bundle.
Expand All @@ -9,6 +9,7 @@
import logging
import math
import os
from pathlib import Path

import omero
import omero.grid
Expand Down Expand Up @@ -170,7 +171,7 @@ def create_table(source, table_name, links, conn, chunk_size):
bar_format='{desc}: {percentage:3.0f}%|{bar}| '
'{n_fmt}/{total_fmt} rows, {elapsed} {postfix}')

if isinstance(source, str):
if isinstance(source, (str, Path)):
assert os.path.exists(source), f"Could not find file {source}"
columns, str_cols, total_rows, chunk_size = generate_omero_columns_csv(
source, chunk_size)
Expand Down
17 changes: 10 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ name = "omero2pandas"
description = "OMERO.tables to pandas bridge"
readme = "README.md"
license = {file = "LICENSE.txt"}
dependencies = [
'omero-py>=5.19.5',
'pandas>2',
'tqdm',
]
requires-python = ">=3.9"
authors = [
{name = "Glencoe Software, Inc.", email="[email protected]"},
]
Expand All @@ -25,10 +19,19 @@ classifiers = [
'Intended Audience :: End Users/Desktop',
'Programming Language :: Python :: 3',
]

requires-python = ">=3.9"
dependencies = [
'omero-py>=5.19.5',
'pandas>2',
'tqdm',
]

[project.optional-dependencies]
token = ["omero-user-token>=0.3.0"]
remote = [
"pyarrow>=19.0.0",
"tiledb>=0.33.2",
]

[project.urls]
github = "https://github.com/glencoesoftware/omero2pandas"
Expand Down

0 comments on commit 3f15348

Please sign in to comment.