Skip to content

Agg() on ID fails #1271

@dmpetrov

Description

@dmpetrov

Description

The failure is not specific to the ID column: it occurs for any column, for example file.size.

import datachain as dc


def process(idx_list: list[int], files: list[dc.File]):
    """Print and yield (idx, JSON-file count, total file count) for one batch."""
    # Only the first idx is read; any further entries of idx_list are ignored.
    batch_id = idx_list[0]
    json_count = sum(1 for item in files if item.get_file_ext() == "json")
    total = len(files)
    print(f"Batch {batch_id}: {json_count}/{total}")
    yield batch_id, json_count, total


data = dc.read_storage("gs://mpii-human-pose/")  # source listing for the reproduction
(
    data
    .mutate(idx=data.c("sys.id") % 13)  # idx = sys.id modulo 13, used as the partition key
    .persist()  # both tracebacks in this report are raised from this call
    .agg(
        process,
        params=("idx", "file"),  # passed to process() as idx_list and files
        output={"idx": int, "processed": int, "total": int},  # one entry per value yielded by process()
        partition_by="idx",
    )
    .save("batch-id")
)

Output CLI:

/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/data_storage/db_engine.py:56: SAWarning: SELECT statement has a cartesian product between FROM element(s) "aCMSImcSmdLLNIhj" and FROM element "anon_2".  Apply join condition(s) between each element to resolve.
  return statement.compile(dialect=cls.dialect, **kwargs)


Traceback (most recent call last):
  File "/Users/dmitry/src/audio_examples/tmp.py", line 18, in <module>
    .persist()
     ^^^^^^^^^
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/lib/dc/datachain.py", line 534, in persist
    query=self._query.save(project=project, feature_schema=schema)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/query/dataset.py", line 1802, in save
    self.catalog.warehouse.copy_table(dr.get_table(), query.select())
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/data_storage/sqlite.py", line 794, in copy_table
    ids = self.db.execute(select_ids).fetchall()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/data_storage/sqlite.py", line 101, in wrapper
    raise exc
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/data_storage/sqlite.py", line 97, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dmitry/src/audio_examples/.audio/lib/python3.12/site-packages/datachain/data_storage/sqlite.py", line 241, in execute
    result = self.db.execute(*self.compile_to_args(query))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sqlite3.OperationalError: no such table: aCMSImcSmdLLNIhj

Output Studio:

/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_sqlalchemy/drivers/compilers/sqlcompiler.py:312: SAWarning: SELECT statement has a cartesian product between FROM element(s) "anon_2" and FROM element "YyEJafkdNIGfqJtN".  Apply join condition(s) between each element to resolve.
  from_linter.warn()
Traceback (most recent call last):
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/dbapi/cursor.py", line 111, in execute
    response = execute(
               ^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/client.py", line 382, in execute
    rv = self.process_ordinary_query(
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/client.py", line 580, in process_ordinary_query
    return self.receive_result(with_column_types=with_column_types,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/client.py", line 212, in receive_result
    return result.get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/result.py", line 50, in get_result
    for packet in self.packet_generator:
                  ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/client.py", line 228, in packet_generator
    packet = self.receive_packet()
             ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/local/datachain_venv/python3.12/default/lib/python3.12/site-packages/clickhouse_driver/client.py", line 245, in receive_packet
    raise packet.exception
clickhouse_driver.errors.ServerException: Code: 60.
DB::Exception: Table studio_production_db.YyEJafkdNIGfqJtN does not exist. Stack trace:

0. ./ci/tmp/build/./src/Common/Exception.cpp:112: DB::Exception::Exception(DB::Exception::MessageMasked&&, int, bool) @ 0x000000000d529e08
1. DB::Exception::Exception(PreformattedMessage&&, int) @ 0x0000000009182e3c
2. DB::Exception::Exception<String, String>(int, FormatStringHelperImpl<std::type_identity<String>::type, std::type_identity<String>::type>, String&&, String&&) @ 0x000000000918295c

Version Info


Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions