diff --git a/.build/.versions.yml b/.build/.versions.yml
index b8d9345..b7282a0 100644
--- a/.build/.versions.yml
+++ b/.build/.versions.yml
@@ -35,7 +35,7 @@ compatibility-matrix:
     scala_version: [2.12, 2.13]
     hadoop_version: 3
     spark_download_url: https://archive.apache.org/dist/spark/
-### Ovveride the matrix above by providing the versions to build
+### Override the matrix above by providing the versions to build
 ### 1- The build-matrix is empty: build with all possible combintations
 ### 2- Override specific versions: build with all possible combinations which are compatible with that specific versions
 ### 3- The versions not present on compatibility-matrix are ignored
diff --git a/.github/workflows/build-base.yml b/.github/workflows/build-test-base.yml
similarity index 100%
rename from .github/workflows/build-base.yml
rename to .github/workflows/build-test-base.yml
diff --git a/.github/workflows/build-datascience.yml b/.github/workflows/build-test-datascience.yml
similarity index 100%
rename from .github/workflows/build-datascience.yml
rename to .github/workflows/build-test-datascience.yml
diff --git a/.github/workflows/build-spark.yml b/.github/workflows/build-test-spark.yml
similarity index 100%
rename from .github/workflows/build-spark.yml
rename to .github/workflows/build-test-spark.yml
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index dad509d..8b3eb04 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -25,7 +25,7 @@ on:
 
   push:
     branches:
-      - build-pipeline
+      - main
     paths:
       - ".github/workflows/main.yml"
       - ".github/workflows/build-base.yml"
@@ -95,13 +95,13 @@ jobs:
     needs: [run-unit-tests]
 
   build-base:
-    name: build-base (python-${{ matrix.python.python_version }})
+    name: build-test-base (python-${{ matrix.python.python_version }})
     strategy:
       # 3 Jobs in //, the base jobs run in sequential
       max-parallel: 3
       matrix:
         python: ${{ fromJson(needs.build-version-compatibility-matrix.outputs.python) }}
-    uses: ./.github/workflows/build-base.yml
+    uses: ./.github/workflows/build-test-base.yml
     with:
       python_version: ${{ matrix.python.python_version }}
       python_dev_tag: ${{ matrix.python.python_dev_tag }}
@@ -113,13 +113,13 @@ jobs:
     needs: [build-version-compatibility-matrix]
 
   build-datascience:
-    name: build-datascience (python-${{ matrix.python.python_version }})
+    name: build-test-datascience (python-${{ matrix.python.python_version }})
     strategy:
       # 1 matrix call = +2 jobs in // (check the number here build-datascience.yml)
       max-parallel: 1
       matrix:
         python: ${{ fromJson(needs.build-version-compatibility-matrix.outputs.python) }}
-    uses: ./.github/workflows/build-datascience.yml
+    uses: ./.github/workflows/build-test-datascience.yml
     with:
       python_dev_tag: ${{ matrix.python.python_dev_tag }}
       registry: ${{ vars.REGISTRY || 'ghcr.io' }}
@@ -130,13 +130,13 @@ jobs:
     needs: [build-version-compatibility-matrix, build-base]
 
   build-spark:
-    name: build-spark (python-${{ matrix.spark.python_version }})
+    name: build-test-spark (python-${{ matrix.spark.python_version }})
     strategy:
       # 2 jobs in //
       max-parallel: 2
       matrix:
         spark: ${{ fromJson(needs.build-version-compatibility-matrix.outputs.spark) }}
-    uses: ./.github/workflows/build-spark.yml
+    uses: ./.github/workflows/build-test-spark.yml
     with:
       spark_download_url: ${{ matrix.spark.spark_download_url }}
       python_version: ${{ matrix.spark.python_version }}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index a85fe29..8c07a70 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -22,7 +22,7 @@ jobs:
         uses: ./docker-stacks/.github/actions/create-dev-env
 
       - name: Run unit tests
-        run: pytest python/tests -v
+        run: pytest python/tests -v --color=yes
         shell: bash
 
diff --git a/README.md b/README.md
index 9649b72..8bcf1ed 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,131 @@
-# kdp-docker-stacks
+# OKDP Jupyter Images
 
-kdp jupyter images based on https://github.com/jupyter/docker-stacks
+[![Build, test, tag, and push jupyter images](https://github.com/OKDP/jupyterlab-docker/actions/workflows/main.yml/badge.svg)](https://github.com/OKDP/jupyterlab-docker/actions/workflows/main.yml)
+OKDP Jupyter Docker images based on the [jupyter docker-stacks](https://github.com/jupyter/docker-stacks) source Dockerfiles. It includes a read-only copy of the [jupyter docker-stacks](https://github.com/jupyter/docker-stacks) repository as a [git-subtree](https://www.atlassian.com/git/tutorials/git-subtree) sub-project.
 
-# Extension
+The project leverages the features provided by [jupyter docker-stacks](https://github.com/jupyter/docker-stacks):
+- Build from the original [source Dockerfiles](docker-stacks/images)
+- Customize the images using Docker ```build-arg``` build arguments
+- Run the original [tests](docker-stacks/tests) at every pipeline trigger
+
+The project provides up-to-date JupyterLab images, especially for PySpark.
 
-# Initial setup
+# Image build workflow
+## Build/Test
+
+The [main](.github/workflows/main.yml) build pipeline is composed of 6 reusable workflows:
+
+1. [build-test-base](.github/workflows/build-test-base.yml): docker-stacks-foundation, base-notebook, minimal-notebook, scipy-notebook
+2. [build-test-datascience](.github/workflows/build-test-datascience.yml): r-notebook, julia-notebook, tensorflow-notebook, pytorch-notebook
+3. [build-test-spark](.github/workflows/build-test-spark.yml): pyspark-notebook, all-spark-notebook
+4. [tag-push](.github/workflows/docker-tag-push.yml): pushes the built images to the container registry (main branch only)
+5. [auto-rerun](.github/workflows/auto-rerun.yml): partially re-runs jobs in case of failures (GitHub runner issues, main branch only)
+6. [unit-tests](.github/workflows/unit-tests.yml): runs the unit tests (OKDP extension) at every pipeline trigger
+
+![build pipeline](doc/_images/build-pipeline.png)
+
+The build is based on the [version compatibility matrix](.build/.versions.yml).
+
+The [build-matrix](.build/.versions.yml#L42) section defines the component versions to build. It acts as a filter on the parent [version compatibility matrix](.build/.versions.yml) to limit the version combinations to build. The build process ensures that only compatible combinations are built.
+
+For example, the following build-matrix:
+
+```yaml
+build-matrix:
+  python_version: ['3.9', '3.10', '3.11']
+  spark_version: [3.2.4, 3.3.4, 3.4.2, 3.5.0]
+  java_version: [11, 17]
+  scala_version: [2.12]
+```
+
+will build the following version combinations, with regard to the [compatibility-matrix](.build/.versions.yml#L5) section:
+- spark3.3.4-python3.10-java17-scala2.12
+- spark3.5.0-python3.11-java17-scala2.12
+- spark3.4.2-python3.11-java17-scala2.12
+- spark3.2.4-python3.9-java11-scala2.12
+
+By default, if no filter is specified:
+
+```yaml
+build-matrix:
+```
+
+All compatible version combinations are built.
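+
+For intuition, a minimal Python sketch of this filtering behaviour is shown below. It is illustrative only: the sample data and the `expand`/`filter_compatible` helpers are hypothetical and are not the actual implementation in [python/okdp/extension/matrix](python/okdp/extension/matrix).
+
+```python
+from itertools import product
+
+# Hypothetical, simplified compatibility data (see .build/.versions.yml for the real matrix).
+compatibility_matrix = [
+    {"python_version": ["3.9"], "spark_version": ["3.2.4"], "java_version": [11], "scala_version": [2.12]},
+    {"python_version": ["3.10"], "spark_version": ["3.3.4"], "java_version": [17], "scala_version": [2.12]},
+    {"python_version": ["3.11"], "spark_version": ["3.4.2", "3.5.0"], "java_version": [17], "scala_version": [2.12, 2.13]},
+]
+
+# The build-matrix filter from the example above.
+build_matrix = {
+    "python_version": ["3.9", "3.10", "3.11"],
+    "spark_version": ["3.2.4", "3.3.4", "3.4.2", "3.5.0"],
+    "java_version": [11, 17],
+    "scala_version": [2.12],
+}
+
+def expand(entry: dict) -> list[dict]:
+    """Expand one compatibility entry into every concrete version combination."""
+    keys = list(entry)
+    return [dict(zip(keys, values)) for values in product(*entry.values())]
+
+def filter_compatible(compatibility: list[dict], wanted: dict) -> list[dict]:
+    """Keep only the combinations whose versions are all listed in the build-matrix filter."""
+    combos = [combo for entry in compatibility for combo in expand(entry)]
+    return [
+        combo for combo in combos
+        if all(combo[key] in values for key, values in wanted.items() if values)
+    ]
+
+# Prints the four spark/python/java/scala combinations listed above.
+for combo in filter_compatible(compatibility_matrix, build_matrix):
+    print(combo)
+```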
+
+Finally, all the images are tested against the original [tests](docker-stacks/tests) at every pipeline trigger.
+
+## Push
+
+Development images, whose tags carry a ```--latest``` suffix (e.g. spark3.2.4-python3.9-java11-scala2.12--latest), are produced at every pipeline run regardless of the git branch (main or not).
+
+The [official images](#tagging) are pushed to the [container registry](https://github.com/orgs/OKDP/packages) when:
+
+1. The workflow is triggered on the main branch only, and
+2. The [tests](#buildtest) are completed successfully
+
+This prevents pull requests or development branches from pushing the official images before they are reviewed or tested. It also provides the flexibility to test against the ```--latest``` development images before they are officially pushed.
+
+## Tagging
+
+The project builds the images with long-format tags. Each tag combines multiple compatible component versions.
+
+There are multiple tag levels, and the format to choose depends on your needs in terms of stability and reproducibility.
+
+Here are some examples:
+
+### scipy-notebook:
+- python-3.11-2024-02-06
+- python-3.11.7-2024-02-06
+- python-3.11.7-hub-4.0.2-lab-4.1.0
+- python-3.11.7-hub-4.0.2-lab-4.1.0-2024-02-06
+
+### datascience-notebook:
+- python-3.9-2024-02-06
+- python-3.9.18-2024-02-06
+- python-3.9.18-hub-4.0.2-lab-4.1.0
+- python-3.9.18-hub-4.0.2-lab-4.1.0-2024-02-06
+- python-3.9.18-r-4.3.2-julia-1.10.0-2024-02-06
+- python-3.9.18-r-4.3.2-julia-1.10.0-hub-4.0.2-lab-4.1.0
+- python-3.9.18-r-4.3.2-julia-1.10.0-hub-4.0.2-lab-4.1.0-2024-02-06
+
+### pyspark-notebook:
+- spark-3.5.0-python-3.11-java-17-scala-2.12
+- spark-3.5.0-python-3.11-java-17-scala-2.12-2024-02-06
+- spark-3.5.0-python-3.11.7-java-17.0.9-scala-2.12.18-hub-4.0.2-lab-4.1.0
+- spark-3.5.0-python-3.11.7-java-17.0.9-scala-2.12.18-hub-4.0.2-lab-4.1.0-2024-02-06
+- spark-3.5.0-python-3.11.7-r-4.3.2-java-17.0.9-scala-2.12.18-hub-4.0.2-lab-4.1.0
+- spark-3.5.0-python-3.11.7-r-4.3.2-java-17.0.9-scala-2.12.18-hub-4.0.2-lab-4.1.0-2024-02-06
+
+Please check the [container registry](https://github.com/orgs/OKDP/packages) for more images and tags.
+
+# Build locally with Act
+
+[Act](https://github.com/nektos/act) can be used to build and test locally.
+
+Here is an example command:
 
 ```shell
-git remote add docker-stacks https://github.com/jupyter/docker-stacks.git
-git subtree add --prefix=docker-stacks --squash docker-stacks main
+$ act --container-architecture linux/amd64 \
+  -W .github/workflows/main.yml \
+  --env ACT_SKIP_TESTS= \
+  --var REGISTRY=ghcr.io \
+  --secret REGISTRY_USERNAME= \
+  --secret REGISTRY_ROBOT_TOKEN= \
+  --rm
 ```
+Set the option ```--container-architecture linux/amd64``` if you are running locally on Apple M1/M2 chips.
+
+For more information:
 
 ```shell
-act --container-architecture linux/amd64 \
-  -W .github/workflows/docker.yml \
-  --artifact-server-path /tmp/act/artifacts \
-  --env ACT_SKIP_TESTS= \
-  --env PUSH_TO_REGISTRY=true \
-  --env REGISTRY=ghcr.io \
-  --secret REGISTRY_USERNAME= \
-  --secret REGISTRY_ROBOT_TOKEN=
-  --rm
-```
\ No newline at end of file
+$ act --help
+```
+
+# OKDP custom extensions
+
+1. [Tagging extension](python/okdp/extension/tagging) is based on the original [jupyter docker-stacks](docker-stacks/tagging) source files
+2. [Patches](python/okdp/patch/README.md) patches the original [jupyter docker-stacks](docker-stacks/tests) tests in order to run them
+3. [Version compatibility matrix](python/okdp/extension/matrix) generates all the compatible version combinations for PySpark
+4. [Unit tests](python/tests) test the OKDP extensions at every pipeline run
\ No newline at end of file
diff --git a/doc/_images/build-pipeline.png b/doc/_images/build-pipeline.png
new file mode 100644
index 0000000..466bde3
Binary files /dev/null and b/doc/_images/build-pipeline.png differ
diff --git a/python/tests/extension/matrix/test_version_compatibility_matrix.py b/python/tests/extension/matrix/test_version_compatibility_matrix.py
index 1fe19b4..3625917 100644
--- a/python/tests/extension/matrix/test_version_compatibility_matrix.py
+++ b/python/tests/extension/matrix/test_version_compatibility_matrix.py
@@ -42,7 +42,7 @@ def test_group_versions_by(
     """
     assert group_versions_by(version_compatibility_matrix_data, group_on=group_on) == to_dict(expected)
 
-def test_build_matrix_empty(
+def test_filter_by_empty_versions(
     version_compatibility_matrix_data: list[dict],
 ) -> None:
     # Given: version_compatibility_matrix_data
@@ -67,7 +67,7 @@
     assert to_dict(expected_build_matrix_empty) == spark_matrix
 
-def test_filter_spark_version(
+def test_filter_by_spark_version(
     version_compatibility_matrix_data: list[dict],
 ) -> None:
     # Given: version_compatibility_matrix_data
@@ -111,7 +111,7 @@
     assert spark_matrix == to_dict(expected_test_filter_spark_version)
     assert python_version == to_dict("""[{"python_version": "3.9", "python_dev_tag": "python3.9-main-latest"}]""")
 
-def test_filter_spark_version_scala_version(
+def test_filter_by_spark_version_and_scala_version(
    version_compatibility_matrix_data: list[dict],
 ) -> None:
     # Given: version_compatibility_matrix_data
@@ -145,7 +145,80 @@
     assert spark_matrix == to_dict(expected_test_filter_spark_version)
     assert python_version == to_dict("""[{"python_version": "3.9", "python_dev_tag": "python3.9-main-latest"}]""")
 
-def test_filter_wrong_version(
+def test_filter_by_multiple_versions(
+    version_compatibility_matrix_data: list[dict],
+) -> None:
+    # Given: version_compatibility_matrix_data
+    version_compatibility_matrix = version_compatibility_matrix_data
+    # Filter the compatibility matrix by multiple versions at the same time
+    build_matrix = {
+        "python_version": ["3.9", "3.10", "3.11"],
+        "spark_version": ["3.2.4", "3.3.4", "3.4.2", "3.5.0"],
+        "java_version": [11, 17],
+        "scala_version": [2.12]
+    }
+
+    # When:
+    vcm = MockedVersionCompatibilityMatrix(compatibility_matrix = version_compatibility_matrix,
+                                           build_matrix = build_matrix,
+                                           git_branch="main")
+    vcm._normalize_values_()
+    (spark_matrix, python_version) = vcm.generate_matrix()
+
+    # Then: check the number of compatible combinations retained by the filter
+    expected_nb_combinations = 4
+    actual_nb_combinations = len(spark_matrix)
+    assert actual_nb_combinations == expected_nb_combinations, f"spark_matrix: The number of elements should be {expected_nb_combinations}, got {actual_nb_combinations}"
+
+    assert spark_matrix == to_dict("""[
+    {
+        "python_version": "3.10",
+        "spark_version": "3.3.4",
+        "java_version": "17",
+        "scala_version": "",
+        "hadoop_version": "3",
+        "spark_download_url": "https://archive.apache.org/dist/spark/",
+        "spark_dev_tag": "spark3.3.4-python3.10-java17-scala2.12-main-latest",
+        "python_dev_tag": "python3.10-main-latest"
+    },
+    {
+        "python_version": "3.11",
+        "spark_version": "3.5.0",
+        "java_version": "17",
+        "scala_version": "",
"hadoop_version": "3", + "spark_download_url": "https://archive.apache.org/dist/spark/", + "spark_dev_tag": "spark3.5.0-python3.11-java17-scala2.12-main-latest", + "python_dev_tag": "python3.11-main-latest" + }, + { + "python_version": "3.11", + "spark_version": "3.4.2", + "java_version": "17", + "scala_version": "", + "hadoop_version": "3", + "spark_download_url": "https://archive.apache.org/dist/spark/", + "spark_dev_tag": "spark3.4.2-python3.11-java17-scala2.12-main-latest", + "python_dev_tag": "python3.11-main-latest" + }, + { + "python_version": "3.9", + "spark_version": "3.2.4", + "java_version": "11", + "scala_version": "", + "hadoop_version": "3.2", + "spark_download_url": "https://archive.apache.org/dist/spark/", + "spark_dev_tag": "spark3.2.4-python3.9-java11-scala2.12-main-latest", + "python_dev_tag": "python3.9-main-latest" + } + ]""") + assert python_version == to_dict("""[ + {"python_version": "3.10", "python_dev_tag": "python3.10-main-latest"}, + {"python_version": "3.11", "python_dev_tag": "python3.11-main-latest"}, + {"python_version": "3.9", "python_dev_tag": "python3.9-main-latest"} + ]""") + +def test_filter_by_wrong_version( version_compatibility_matrix_data: list[dict], ) -> None: # Given: version_compatibility_matrix_data @@ -165,4 +238,3 @@ def test_filter_wrong_version( actual_nb_combinations = len(spark_matrix) assert actual_nb_combinations == expected_nb_combinations, f"spark_matrix: The number of elements should be {expected_nb_combinations}, got {actual_nb_combinations}" assert len(python_version) == expected_nb_combinations, f"python_version: The number of elements should be {expected_nb_combinations}, got {actual_nb_combinations}" -