From e5b1e35e0d07374117c5aeced7fd0482bbb11c74 Mon Sep 17 00:00:00 2001
From: Kevin Klein <7267523+kklein@users.noreply.github.com>
Date: Fri, 24 Feb 2023 18:09:12 +0100
Subject: [PATCH] Add experimental support for db2. (#107)

* Draft integration of db2.

* Fix date gap.

* Add dependencies.

* Fix capitalization tests.

* Add case distinction for varchar column.

* Add bash script for local development.

* Add changelog entry.

* Update CHANGELOG.rst

Co-authored-by: Ignacio Vergara Kausel

* Add skip message.

---------

Co-authored-by: Ignacio Vergara Kausel
---
 .github/workflows/ci.yaml                     | 50 +++++++++++++++++++
 CHANGELOG.rst                                 | 10 +++-
 environment.yml                               |  2 +
 src/datajudge/db_access.py                    | 21 ++++++++
 start_db2.sh                                  |  5 ++
 tests/integration/conftest.py                 | 42 ++++++++++++----
 .../integration/test_column_capitalization.py |  6 ++-
 tests/integration/test_integration.py         | 26 ++++++++--
 tests/integration/test_stats.py               | 13 ++++-
 9 files changed, 156 insertions(+), 19 deletions(-)
 create mode 100755 start_db2.sh

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3429a4f7..25cc7f67 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -155,6 +155,55 @@ jobs:
         with:
           file: ./coverage.xml
 
+
+  linux-integration_tests-db2:
+    name: "Linux - integration tests - Python ${{ matrix.PYTHON_VERSION }} - DB2"
+    runs-on: ubuntu-20.04
+    env:
+      CI: True
+    strategy:
+      fail-fast: false
+      matrix:
+        PYTHON_VERSION: [ '3.8', '3.9', '3.10' ]
+    services:
+      DB:
+        image: ibmcom/db2:11.5.5.1
+        env:
+          LICENSE: accept
+          DB2INSTANCE: db2inst1
+          DB2INST1_PASSWORD: password
+          DBNAME: testdb
+          UPDATEAVAIL: "NO"
+        options: --privileged
+        ports:
+          - 50000:50000
+
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.head_ref }}
+      - name: Fetch full git history
+        run: git fetch --prune --unshallow
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: ${{ matrix.PYTHON_VERSION }}
+          miniforge-variant: Mambaforge
+          miniforge-version: 4.11.0-2
+          use-mamba: true
+          environment-file: environment.yml
+          activate-environment: datajudge
+      - name: Run Integration Tests
+        shell: bash -l {0}
+        run: |
+          flit install -s
+          pytest --cov=datajudge --cov-report=xml --cov-append --backend=db2 tests/integration
+      - name: Generate code coverage report
+        uses: codecov/codecov-action@v3.1.1
+        with:
+          file: ./coverage.xml
+
+
   linux-integration_tests-snowflake:
     name: "Linux - integration tests - Python ${{ matrix.PYTHON_VERSION }} - Snowflake"
     runs-on: ubuntu-latest
@@ -228,6 +277,7 @@ jobs:
         with:
           file: ./coverage.xml
 
+
   linux-integration_tests-impala-column-pt1:
     name: "Linux - integration tests - Python ${{ matrix.PYTHON_VERSION }} - Impala - pt1"
     runs-on: ubuntu-20.04
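A practical note on the service container above: the ibmcom/db2 image is not ready the moment it starts; it typically needs several minutes to create the instance and the testdb database before it accepts connections. Below is an illustrative readiness poll one might run locally before invoking the test suite. The function name and timeout are made up, the URL mirrors the connection string used in tests/integration/conftest.py, and the snippet assumes the ibm_db_sa driver from environment.yml is installed.

    import time

    import sqlalchemy as sa

    def wait_for_db2(
        url="db2+ibm_db://db2inst1:password@localhost:50000/testdb",
        timeout=600,
    ):
        # Poll until the Db2 instance accepts connections or the timeout expires.
        engine = sa.create_engine(url)
        deadline = time.time() + timeout
        while True:
            try:
                with engine.connect() as connection:
                    # SYSIBM.SYSDUMMY1 is Db2's built-in one-row dummy table.
                    connection.execute(sa.text("SELECT 1 FROM SYSIBM.SYSDUMMY1"))
                    return
            except sa.exc.DBAPIError:
                if time.time() > deadline:
                    raise
                time.sleep(10)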
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 63838237..056a8b4c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,15 @@
 Changelog
 =========
 
-1.3.0 - 2022.xx.xx
+1.4.0 - 2023.02.24
+------------------
+
+**New features**
+
+- Add partial and experimental support for db2 as a backend.
+
+
+1.3.0 - 2023.01.17
 ------------------
 
 **New features**
diff --git a/environment.yml b/environment.yml
index 8687f3bb..312b15fb 100644
--- a/environment.yml
+++ b/environment.yml
@@ -25,3 +25,5 @@ dependencies:
   - flit
   - sphinx-autodoc-typehints
   - impyla
+  - ibm_db
+  - ibm_db_sa
diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py
index a4a6d782..9ee7f640 100644
--- a/src/datajudge/db_access.py
+++ b/src/datajudge/db_access.py
@@ -33,6 +33,10 @@ def is_impala(engine: sa.engine.Engine) -> bool:
     return engine.name == "impala"
 
 
+def is_db2(engine: sa.engine.Engine) -> bool:
+    return engine.name == "ibm_db_sa"
+
+
 def get_table_columns(table, column_names):
     return [table.c[column_name] for column_name in column_names]
 
@@ -421,6 +425,15 @@ def get_date_span(engine, ref, date_column_name):
                 )
             ]
         )
+    elif is_db2(engine):
+        selection = sa.select(
+            [
+                sa.func.days_between(
+                    sa.func.max(column),
+                    sa.func.min(column),
+                )
+            ]
+        )
     else:
         raise NotImplementedError(
             "Date spans not yet implemented for this sql dialect."
@@ -663,6 +676,14 @@ def get_date_gaps(
             )
             > legitimate_gap_size
         )
+    elif is_db2(engine):
+        gap_condition = (
+            sa.func.days_between(
+                start_table.c[start_column],
+                end_table.c[end_column],
+            )
+            > legitimate_gap_size
+        )
     else:
         raise NotImplementedError(f"Date gaps not yet implemented for {engine.name}.")
 
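Both new branches lean on Db2's DAYS_BETWEEN scalar function, reached through SQLAlchemy's generic sa.func namespace, which emits any attribute access verbatim as a SQL function call. A quick sketch of what the get_date_span branch builds; the table and column names are made-up stand-ins:

    import sqlalchemy as sa

    metadata = sa.MetaData()
    table = sa.Table("some_table", metadata, sa.Column("col_date", sa.DateTime()))
    column = table.c["col_date"]

    # sa.func.days_between is not validated client-side; it is rendered as-is,
    # so the statement only runs on dialects that know the function, like Db2.
    selection = sa.select(
        [sa.func.days_between(sa.func.max(column), sa.func.min(column))]
    )
    print(selection)
    # Renders roughly:
    # SELECT days_between(max(some_table.col_date), min(some_table.col_date))
    # FROM some_table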
sa.Column("col_date", sa.DateTime()), ] data = [ @@ -477,7 +485,7 @@ def unique_table1(engine, metadata): table_name = "unique_table1" columns = [ sa.Column("col_int", sa.Integer()), - sa.Column("col_varchar", sa.String()), + sa.Column("col_varchar", _string_column(engine)), ] data = [{"col_int": i // 2, "col_varchar": f"hi{i // 3}"} for i in range(60)] data += [ @@ -493,7 +501,7 @@ def unique_table2(engine, metadata): table_name = "unique_table2" columns = [ sa.Column("col_int", sa.Integer()), - sa.Column("col_varchar", sa.String()), + sa.Column("col_varchar", _string_column(engine)), ] data = [{"col_int": i // 2, "col_varchar": f"hi{i // 3}"} for i in range(40)] _handle_table(engine, metadata, table_name, columns, data) @@ -503,7 +511,7 @@ def unique_table2(engine, metadata): @pytest.fixture(scope="module") def nested_table(engine, metadata): table_name = "nested_table" - columns = [sa.Column("nested_varchar", sa.String())] + columns = [sa.Column("nested_varchar", _string_column(engine))] data = [ {"nested_varchar": "ABC#1,"}, {"nested_varchar": "ABC#1,DEF#2,"}, @@ -517,7 +525,7 @@ def nested_table(engine, metadata): def varchar_table1(engine, metadata): table_name = "varchar_table1" columns = [ - sa.Column("col_varchar", sa.String()), + sa.Column("col_varchar", _string_column(engine)), ] data = [{"col_varchar": "qq" * i} for i in range(1, 10)] data.append({"col_varchar": None}) @@ -529,7 +537,7 @@ def varchar_table1(engine, metadata): def varchar_table2(engine, metadata): table_name = "varchar_table2" columns = [ - sa.Column("col_varchar", sa.String()), + sa.Column("col_varchar", _string_column(engine)), ] data = [{"col_varchar": "qq" * i} for i in range(2, 11)] _handle_table(engine, metadata, table_name, columns, data) @@ -540,7 +548,7 @@ def varchar_table2(engine, metadata): def varchar_table_real(engine, metadata): table_name = "varchar_table_real" columns = [ - sa.Column("col_varchar", sa.String()), + sa.Column("col_varchar", _string_column(engine)), ] data = [ {"col_varchar": val} @@ -754,6 +762,10 @@ def capitalization_table(engine, metadata): str_datatype = "STRING" # Impala supports primary keys but uses a different grammar. primary_key = "" + elif is_db2(engine): + str_datatype = "VARCHAR(20)" + # Primary key needs to be non-nullable. + primary_key = "" else: str_datatype = "TEXT" with engine.connect() as connection: @@ -796,7 +808,15 @@ def pytest_addoption(parser): parser.addoption( "--backend", choices=( - ("mssql", "mssql-freetds", "postgres", "snowflake", "bigquery", "impala") + ( + "mssql", + "mssql-freetds", + "postgres", + "snowflake", + "bigquery", + "impala", + "db2", + ) ), help="which database backend to use to run the integration tests", ) diff --git a/tests/integration/test_column_capitalization.py b/tests/integration/test_column_capitalization.py index 688474d0..cef55ccf 100644 --- a/tests/integration/test_column_capitalization.py +++ b/tests/integration/test_column_capitalization.py @@ -1,7 +1,7 @@ import pytest from datajudge import Condition, WithinRequirement -from datajudge.db_access import is_bigquery, is_impala, is_mssql, is_postgresql +from datajudge.db_access import is_bigquery, is_db2, is_impala, is_mssql, is_postgresql # These tests @@ -21,6 +21,10 @@ def test_column_existence( ) if is_postgresql(engine): pytest.skip("Postgres interface always expects lower-cased columns.") + if is_db2(engine) and use_uppercase_query: + pytest.skip( + "Db2 interface transforms writes to lower-case, expects lower-case reads." 
diff --git a/tests/integration/test_column_capitalization.py b/tests/integration/test_column_capitalization.py
index 688474d0..cef55ccf 100644
--- a/tests/integration/test_column_capitalization.py
+++ b/tests/integration/test_column_capitalization.py
@@ -1,7 +1,7 @@
 import pytest
 
 from datajudge import Condition, WithinRequirement
-from datajudge.db_access import is_bigquery, is_impala, is_mssql, is_postgresql
+from datajudge.db_access import is_bigquery, is_db2, is_impala, is_mssql, is_postgresql
 
 
 # These tests
@@ -21,6 +21,10 @@ def test_column_existence(
     )
     if is_postgresql(engine):
         pytest.skip("Postgres interface always expects lower-cased columns.")
+    if is_db2(engine) and use_uppercase_query:
+        pytest.skip(
+            "Db2 interface transforms writes to lower-case and expects lower-case reads."
+        )
     (
         db_name,
         schema_name,
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
index 9cce777c..51556842 100644
--- a/tests/integration/test_integration.py
+++ b/tests/integration/test_integration.py
@@ -6,6 +6,7 @@
 from datajudge.db_access import (
     Condition,
     is_bigquery,
+    is_db2,
     is_impala,
     is_mssql,
     is_postgresql,
@@ -1272,7 +1273,7 @@ def test_varchar_regex_within(engine, mix_table1, computation_in_db, data):
     req = requirements.WithinRequirement.from_table(*mix_table1)
     if computation_in_db:
         # bigquery dialect does not support regular expressions (sqlalchemy-bigquery 1.4.4)
-        if is_mssql(engine) or is_bigquery(engine):
+        if is_mssql(engine) or is_bigquery(engine) or is_db2(engine):
             pytest.skip("Functionality not supported by given dialect.")
         req.add_varchar_regex_constraint_db(
             column="col_varchar",
@@ -1324,7 +1325,7 @@ def test_varchar_regex_tolerance(engine, varchar_table_real, computation_in_db, data):
     req = requirements.WithinRequirement.from_table(*varchar_table_real)
     if computation_in_db:
         # The feature is not supported in sqlalchemy-bigquery 1.4.4
-        if is_mssql(engine) or is_bigquery(engine):
+        if is_mssql(engine) or is_bigquery(engine) or is_db2(engine):
             pytest.skip("Functionality not supported by given dialect.")
         req.add_varchar_regex_constraint_db(
             "col_varchar",
@@ -1366,7 +1367,7 @@ def test_varchar_regex_counterexample(
     req = requirements.WithinRequirement.from_table(*varchar_table_real)
     if computation_in_db:
         # The feature is not supported in sqlalchemy-bigquery 1.4.4
-        if is_mssql(engine) or is_bigquery(engine):
+        if is_mssql(engine) or is_bigquery(engine) or is_db2(engine):
             pytest.skip("Functionality not supported by given dialect.")
         req.add_varchar_regex_constraint_db(
             "col_varchar",
@@ -1446,6 +1447,7 @@ def test_backend_dependent_condition(engine, mix_table1):
         or is_snowflake(engine)
         or is_bigquery(engine)
         or is_impala(engine)
+        or is_db2(engine)
     ):
         condition = Condition(raw_string="LENGTH(col_varchar) = 3")
     else:
@@ -1904,6 +1906,9 @@ def test_row_superset_between(engine, mix_table2, mix_table1, data):
     ],
 )
 def test_row_matching_equality(engine, row_match_table1, row_match_table2, data):
+    # TODO: Not sure why this doesn't work
+    if is_db2(engine):
+        pytest.skip()
     if is_impala(engine):
         pytest.skip("Currently not implemented for Impala. EXCEPT throws syntax error.")
     (
@@ -1933,6 +1938,9 @@
 @pytest.mark.parametrize("key", [("some_id",), ("some_id", "extra_id")])
 def test_groupby_aggregation_within(engine, groupby_aggregation_table_correct, key):
     skip_if_mssql(engine)
+    # TODO: This should actually work for db2
+    if is_db2(engine):
+        pytest.skip()
     if is_impala(engine):
         pytest.skip("array_agg does not exist for Impala.")
     req = requirements.WithinRequirement.from_table(*groupby_aggregation_table_correct)
@@ -1947,6 +1955,8 @@ def test_groupby_aggregation_within_with_failures(
     engine, groupby_aggregation_table_incorrect, tolerance, operation, key
 ):
     skip_if_mssql(engine)
+    if is_db2(engine):
+        pytest.skip()
     if is_impala(engine):
         pytest.skip("array_agg does not exist for Impala.")
     req = requirements.WithinRequirement.from_table(
@@ -1975,6 +1985,8 @@ def test_ks_2sample_constraint_perfect_between(engine, int_table1, data):
     """
     Test Kolmogorov-Smirnov for the same column -> p-value should be perfect 1.0.
     """
+    if is_db2(engine):
+        pytest.skip()
     (operation, col_1, col_2, condition1, condition2, significance_level) = data
     req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1)
     req.add_ks_2sample_constraint(
@@ -2013,6 +2025,9 @@ def test_ks_2sample_constraint_perfect_between_different_conditions(
     As a consequence, since the data is distinct, the tests are expected to
     fail for a very high significance level.
     """
+    # TODO: Figure out why this is necessary.
+    if is_db2(engine):
+        pytest.skip()
     req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1)
     req.add_ks_2sample_constraint(
         column1="col_int",
@@ -2035,6 +2050,9 @@ def test_ks_2sample_constraint_wrong_between(
     """
     Test kolmogorov smirnov test for table and square of table -> significance level should be less than default 0.05
     """
+    # TODO: Figure out why this is necessary.
+    if is_db2(engine):
+        pytest.skip()
     (operation, col_1, col_2, min_p_value) = data
     req = requirements.BetweenRequirement.from_tables(*int_table1, *int_square_table)
     req.add_ks_2sample_constraint(
@@ -2063,7 +2081,7 @@ def test_ks_2sample_constraint_wrong_between(
     ],
 )
 def test_ks_2sample_random(engine, random_normal_table, configuration):
-    if is_bigquery(engine) or is_impala(engine):
+    if is_bigquery(engine) or is_impala(engine) or is_db2(engine):
         pytest.skip("It takes too long to insert the table.")
 
     (operation, col_1, col_2, min_p_value) = configuration
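One pattern worth noting from test_backend_dependent_condition above: raw-SQL conditions are dialect-specific, and Db2 joins the LENGTH camp. A hedged sketch of the split; the MSSQL spelling below is an assumption based on the test's else branch, which this diff does not show:

    from datajudge import Condition

    # Db2, Postgres, Snowflake, BigQuery and Impala spell string length LENGTH;
    # MSSQL spells it LEN (assumed, per the unshown else branch).
    condition_length = Condition(raw_string="LENGTH(col_varchar) = 3")
    condition_len = Condition(raw_string="LEN(col_varchar) = 3")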
""" + if is_db2(engine): + pytest.skip() (operation, col_1, col_2, condition1, condition2, significance_level) = data req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1) req.add_ks_2sample_constraint( @@ -2013,6 +2025,9 @@ def test_ks_2sample_constraint_perfect_between_different_conditions( As a consequence, since the data is distinct, the tests are expected to fail for a very high significance level. """ + # TODO: Figure out why this is necessary. + if is_db2(engine): + pytest.skip() req = requirements.BetweenRequirement.from_tables(*int_table1, *int_table1) req.add_ks_2sample_constraint( column1="col_int", @@ -2035,6 +2050,9 @@ def test_ks_2sample_constraint_wrong_between( """ Test kolmogorov smirnov test for table and square of table -> significance level should be less than default 0.05 """ + # TODO: Figure out why this is necessary. + if is_db2(engine): + pytest.skip() (operation, col_1, col_2, min_p_value) = data req = requirements.BetweenRequirement.from_tables(*int_table1, *int_square_table) req.add_ks_2sample_constraint( @@ -2063,7 +2081,7 @@ def test_ks_2sample_constraint_wrong_between( ], ) def test_ks_2sample_random(engine, random_normal_table, configuration): - if is_bigquery(engine) or is_impala(engine): + if is_bigquery(engine) or is_impala(engine) or is_db2(engine): pytest.skip("It takes too long to insert the table.") (operation, col_1, col_2, min_p_value) = configuration diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py index 6912d1a8..812a6ebc 100644 --- a/tests/integration/test_stats.py +++ b/tests/integration/test_stats.py @@ -1,10 +1,19 @@ import pytest import datajudge -from datajudge.db_access import DataReference, TableDataSource, is_bigquery, is_impala +from datajudge.db_access import ( + DataReference, + TableDataSource, + is_bigquery, + is_db2, + is_impala, +) def test_cross_cdf_selection(engine, cross_cdf_table1, cross_cdf_table2): + # TODO: Fix this + if is_db2(engine): + pytest.skip() database1, schema1, table1 = cross_cdf_table1 database2, schema2, table2 = cross_cdf_table2 tds1 = TableDataSource(database1, table1, schema1) @@ -38,7 +47,7 @@ def test_cross_cdf_selection(engine, cross_cdf_table1, cross_cdf_table2): ], ) def test_ks_2sample_calculate_statistic(engine, random_normal_table, configuration): - if is_bigquery(engine) or is_impala(engine): + if is_bigquery(engine) or is_impala(engine) or is_db2(engine): pytest.skip("It takes too long to insert the table into BigQuery") col_1, col_2, expected_d, expected_p = configuration