Skip to content
This repository has been archived by the owner on Nov 6, 2023. It is now read-only.

Commit

Permalink
Clickhouse/fix/lexical parsing (#240)
Browse files Browse the repository at this point in the history
* fix(clickhouse): datetime lexical parsing

* fix(clickhouse): aggfunctions lexical parsing
  • Loading branch information
Vixtir authored Oct 20, 2023
1 parent f61ac36 commit a5fe42f
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 23 deletions.
31 changes: 17 additions & 14 deletions odd_collector/adapters/clickhouse/adapter.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,40 @@
from typing import List

from odd_collector_sdk.domain.adapter import AbstractAdapter
from odd_collector_sdk.domain.adapter import BaseAdapter
from odd_models.models import DataEntity, DataEntityList
from oddrn_generator import ClickHouseGenerator
from oddrn_generator import ClickHouseGenerator, Generator

from ...domain.plugin import ClickhousePlugin
from .mappers.tables import map_table
from .repository import ClickHouseRepository


class Adapter(AbstractAdapter):
class Adapter(BaseAdapter):
generator: ClickHouseGenerator
config: ClickhousePlugin

def __init__(self, config: ClickhousePlugin) -> None:
self.__db = config.database
self.clickhouse_repository = ClickHouseRepository(config)
self.__oddrn_generator = ClickHouseGenerator(
host_settings=f"{config.host}", databases=config.database
)
super().__init__(config)
self.db = config.database
self.repository = ClickHouseRepository(config)

def get_data_source_oddrn(self) -> str:
return self.__oddrn_generator.get_data_source_oddrn()
def create_generator(self) -> Generator:
return ClickHouseGenerator(
host_settings=f"{self.config.host}", databases=self.config.database
)

def get_data_entities(self) -> List[DataEntity]:
records = self.clickhouse_repository.get_records()
records = self.repository.get_records()
return map_table(
self.__oddrn_generator,
self.generator,
records.tables,
records.columns,
records.integration_engines,
self.__db,
self.db,
)

def get_data_entity_list(self) -> DataEntityList:
return DataEntityList(
data_source_oddrn=self.get_data_source_oddrn(),
items=(self.get_data_entities()),
items=self.get_data_entities(),
)
43 changes: 43 additions & 0 deletions odd_collector/adapters/clickhouse/grammar_parser/column_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,49 @@ def __repr__(self) -> str:
return f"BasicType({self.type_name})"


class AggregateFunction(ParseType):
    """Parsed ClickHouse ``AggregateFunction(...)`` column type.

    Attributes:
        type_name: Name of the type, e.g. ``"AggregateFunction"``.
        params: Raw argument text of the type. The parser passes the token
            matched by the grammar's ``/\\((.*)\\)/`` terminal, so the value
            normally already carries its surrounding parentheses,
            e.g. ``"(uniq, UInt64)"``.
    """

    def __init__(self, type_name: str, params: str):
        self.type_name = type_name
        self.params = params

    def to_clickhouse_type(self) -> str:
        """Return the type in ClickHouse syntax with balanced parentheses.

        Previously a closing parenthesis was appended unconditionally, which
        produced ``AggregateFunction(uniq, UInt64))`` for the parenthesised
        token the parser supplies.
        """
        args = self.params
        # Wrap only when the caller supplied the bare argument list.
        if not (args.startswith("(") and args.endswith(")")):
            args = f"({args})"
        return f"{self.type_name}{args}"

    def __repr__(self) -> str:
        return f"AggregateFunction({self.type_name})"


class LowCardinality(ParseType):
    """Parsed ClickHouse ``LowCardinality(...)`` column type.

    Attributes:
        type_name: Name of the type, e.g. ``"LowCardinality"``.
        params: Raw argument text of the type. The parser passes the token
            matched by the grammar's ``/\\((.*)\\)/`` terminal, so the value
            normally already carries its surrounding parentheses,
            e.g. ``"(String)"``.
    """

    def __init__(self, type_name: str, params: str):
        self.type_name = type_name
        self.params = params

    def to_clickhouse_type(self) -> str:
        """Return the type in ClickHouse syntax with balanced parentheses.

        Previously a closing parenthesis was appended unconditionally, which
        produced ``LowCardinality(String))`` for the parenthesised token the
        parser supplies.
        """
        args = self.params
        # Wrap only when the caller supplied the bare argument list.
        if not (args.startswith("(") and args.endswith(")")):
            args = f"({args})"
        return f"{self.type_name}{args}"

    def __repr__(self) -> str:
        return f"LowCardinality({self.type_name})"


class DateTime(ParseType):
    """Parsed ClickHouse ``DateTime(<timezone>)`` column type.

    Attributes:
        type_name: Name of the type, e.g. ``"DateTime"``.
        time_zone: Time zone text exactly as written in the column
            definition; the parser passes the matched TIMEZONE token, which
            includes its single quotes (e.g. ``"'UTC'"``).
    """

    def __init__(self, type_name: str, time_zone: str):
        self.type_name = type_name
        self.time_zone = time_zone

    def to_clickhouse_type(self) -> str:
        """Return the type in ClickHouse syntax, e.g. ``DateTime('UTC')``."""
        return f"{self.type_name}({self.time_zone})"

    def __repr__(self) -> str:
        # Sibling parse types define __repr__; provide one here as well so
        # debug output is readable instead of the default object repr.
        return f"DateTime({self.time_zone})"


class DateTime64(ParseType):
    """Parsed ClickHouse ``DateTime64(<precision>, <timezone>)`` column type.

    Attributes:
        type_name: Name of the type, e.g. ``"DateTime64"``.
        digit: Sub-second precision; the parser passes the matched DIGIT
            token text (e.g. ``"3"``).
        time_zone: Time zone text exactly as written in the column
            definition, including its single quotes (e.g. ``"'UTC'"``).
    """

    def __init__(self, type_name: str, digit, time_zone: str):
        self.type_name = type_name
        self.digit = digit
        self.time_zone = time_zone

    def to_clickhouse_type(self) -> str:
        """Return the type in ClickHouse syntax, e.g. ``DateTime64(3, 'UTC')``."""
        return f"{self.type_name}({self.digit}, {self.time_zone})"

    def __repr__(self) -> str:
        # Sibling parse types define __repr__; provide one here as well so
        # debug output is readable instead of the default object repr.
        return f"DateTime64({self.digit}, {self.time_zone})"


class Array(ParseType):
def __init__(self, type: ParseType):
self.type = type
Expand Down
33 changes: 30 additions & 3 deletions odd_collector/adapters/clickhouse/grammar_parser/filed_types.lark
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
?start: type

?type: nested
| aggregate
| array
| named_tuple
| tuple
| map
| low_cardinality
| date
| date32
| datetime
| datetime64
| BASIC_TYPE

BASIC_TYPE: "UInt64"
Expand All @@ -20,24 +26,45 @@ BASIC_TYPE: "UInt64"
| "Nullable"
| "UUID"
| "Enum"
| "Low Cardinality"
| "Bool"
| "Date"

array: "Array" "(" type ")"

aggregate: "AggregateFunction" /\((.*)\)/

params: (", " BASIC_TYPE)*
low_cardinality: "LowCardinality" /\((.*)\)/


uniq: "uniq"
anyif: "anyIf"
quantiles: "quantiles" "(" NUMBER "," WS NUMBER ")"

functions: uniq | anyif | quantiles

tuple: "Tuple" "(" type ("," WS type)* ")"

named_tuple: "Tuple" "(" field ("," WS field)* ")"

nested: "Nested" "(" field ("," WS field)* ")"

map: "Map" "(" type "," WS type ")"

field: FIELD_NAME WS type

date: "Date"

date32: "Date32"

datetime: "DateTime" "(" TIMEZONE ")"

datetime64: "DateTime64" "(" DIGIT "," WS TIMEZONE ")"

FIELD_NAME: (LETTER | "_") (LETTER | "_" | DIGIT | "-")*
TIMEZONE: "'" (LETTER | "/" )* "'"
fn_args: /.*/

%import common.WS
%import common.LETTER
%import common.NUMBER
%import common.DIGIT
%import common.WORD
20 changes: 19 additions & 1 deletion odd_collector/adapters/clickhouse/grammar_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@

from ..logger import logger
from .column_type import (
AggregateFunction,
Array,
BasicType,
DateTime,
DateTime64,
Field,
LowCardinality,
Map,
NamedTuple,
Nested,
Expand All @@ -23,13 +27,27 @@
LALR does not support the different types of Tuples presented in filed_types.lark
"""


parser = Lark.open("filed_types.lark", rel_to=__file__, parser="earley", start="type")


def traverse_tree(node) -> Union[ParseType, str, Field, None]:
logger.debug(f"Node: {node}")

if isinstance(node, Tree):
if node.data == "aggregate":
return AggregateFunction("AggregateFunction", node.children[0].value)
if node.data == "low_cardinality":
return LowCardinality("LowCardinality", node.children[0].value)
if node.data == "date":
return BasicType("Date")
if node.data == "date32":
return BasicType("Date32")
if node.data == "datetime":
return DateTime("DateTime", node.children[0].value)
if node.data == "datetime64":
return DateTime64(
"DateTime64", node.children[0].value, node.children[2].value
)
if node.data == "array":
if len(node.children) != 1:
raise StructureError(
Expand Down
13 changes: 10 additions & 3 deletions odd_collector/adapters/clickhouse/mappers/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@

from ..domain import Column
from ..grammar_parser.column_type import (
AggregateFunction,
Array,
BasicType,
DateTime,
DateTime64,
LowCardinality,
Map,
NamedTuple,
Nested,
Expand All @@ -19,7 +23,7 @@


def build_dataset_fields(
columns: List[Column], oddrn_generator: ClickHouseGenerator, table_oddrn_path: str
columns: list[Column], oddrn_generator: ClickHouseGenerator, table_oddrn_path: str
) -> List[DataSetField]:
generated_dataset_fields = []
ds_fields_oddrn = {}
Expand Down Expand Up @@ -143,14 +147,17 @@ def _build_dataset_fields(
return generated_dataset_fields


def type_to_oddrn_type(column_type):
def type_to_oddrn_type(column_type: ParseType) -> Type:
if isinstance(column_type, Array):
return Type.TYPE_LIST
elif isinstance(column_type, Nested):
return Type.TYPE_STRUCT
elif isinstance(column_type, Map):
return Type.TYPE_MAP
elif isinstance(column_type, BasicType):
elif isinstance(
column_type,
(BasicType, DateTime, DateTime64, LowCardinality, AggregateFunction),
):
return TYPES_SQL_TO_ODD.get(column_type.type_name, Type.TYPE_UNKNOWN)
elif isinstance(column_type, str):
return TYPES_SQL_TO_ODD.get(column_type, Type.TYPE_UNKNOWN)
Expand Down
3 changes: 3 additions & 0 deletions odd_collector/adapters/clickhouse/mappers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

TYPES_SQL_TO_ODD = {
"Date": Type.TYPE_DATETIME,
"Date32": Type.TYPE_DATETIME,
"DateTime": Type.TYPE_DATETIME,
"DateTime64": Type.TYPE_DATETIME,
"String": Type.TYPE_STRING,
Expand All @@ -25,4 +26,6 @@
"Map": Type.TYPE_MAP,
"Tuple": Type.TYPE_STRUCT,
"Bool": Type.TYPE_BOOLEAN,
"LowCardinality": Type.TYPE_UNION,
"AggregateFunction": Type.TYPE_UNKNOWN,
}
16 changes: 14 additions & 2 deletions tests/integration/test_clickhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ def find_by_type(

create_databse = "CREATE DATABASE IF NOT EXISTS my_database"
create_table = """
CREATE TABLE my_database.test (a Date, b UInt64, c Nested(d String, e String, f Nested(g String))) ENGINE = MergeTree ORDER BY (a, b)
CREATE TABLE my_database.test (
a Date,
b UInt64,
c Nested(d String, e String, f Nested(g String)),
d Date32,
e DateTime('UTC'),
f DateTime64(3, 'UTC'),
c_uuid UUID,
c_agg AggregateFunction(uniq, UInt64),
c_low_cardinality LowCardinality(String)
) ENGINE = MergeTree ORDER BY (a, b)
"""


Expand Down Expand Up @@ -63,4 +73,6 @@ def test_clickhouse():
tables = find_by_type(data_entities, DataEntityType.TABLE)
assert len(tables) == 1
table = tables[0]
assert len(table.dataset.field_list) == 7
assert len(table.dataset.field_list) == 13

assert data_entities.json()

0 comments on commit a5fe42f

Please sign in to comment.