diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 9758d746c87..884e3eef50b 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,24 +18,16 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.08 + - branch-* types: [closed] jobs: auto-merge: if: github.event.pull_request.merged == true - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - ref: branch-24.08 # force to fetch from latest upstream instead of PR ref - - - name: auto-merge job - uses: ./.github/workflows/auto-merge - env: - OWNER: NVIDIA - REPO_NAME: spark-rapids - HEAD: branch-24.08 - BASE: branch-24.10 - AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR + uses: NVIDIA/spark-rapids-common/.github/workflows/auto-merge.yml@main + with: + owner: ${{ github.repository_owner }} + repo: spark-rapids + branch: ${{ github.event.pull_request.base.ref }} + secrets: + token: ${{ secrets.AUTOMERGE_TOKEN }} diff --git a/.github/workflows/auto-merge/Dockerfile b/.github/workflows/auto-merge/Dockerfile deleted file mode 100644 index b85588e4dcc..00000000000 --- a/.github/workflows/auto-merge/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM python:alpine - -WORKDIR / -COPY automerge . -RUN pip install requests && chmod +x /automerge - -# require envs: OWNER,REPO_NAME,HEAD,BASE,GITHUB_TOKEN -ENTRYPOINT ["/automerge"] diff --git a/.github/workflows/auto-merge/action.yml b/.github/workflows/auto-merge/action.yml deleted file mode 100644 index 7bde13464b4..00000000000 --- a/.github/workflows/auto-merge/action.yml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: 'auto-merge action' -description: 'auto-merge HEAD to BASE' -runs: - using: 'docker' - image: 'Dockerfile' - diff --git a/.github/workflows/auto-merge/automerge b/.github/workflows/auto-merge/automerge deleted file mode 100755 index 31c53f394b5..00000000000 --- a/.github/workflows/auto-merge/automerge +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A auto-merge tool - -Create a PR to merge HEAD to BASE branch. -NOTE: - The generated PR should be automatically merged if no conflict. Otherwise, manual operation will be required. -""" - -import os -import sys -import time - -import requests - -# ENV -OWNER = os.environ.get('OWNER') -assert OWNER, 'env OWNER should not be empty' -REPO_NAME = os.environ.get('REPO_NAME') -assert REPO_NAME, 'env REPO_NAME should not be empty' -HEAD = os.environ.get('HEAD') -assert HEAD, 'env HEAD should not be empty' -BASE = os.environ.get('BASE') -assert BASE, 'env BASE should not be empty' -AUTOMERGE_TOKEN = os.environ.get('AUTOMERGE_TOKEN') -assert AUTOMERGE_TOKEN, 'env AUTOMERGE_TOKEN should not be empty' -# static -API_URL = 'https://api.github.com' -AUTH_HEADERS = { - 'Authorization': 'token ' + AUTOMERGE_TOKEN -} - - -def create(): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/pulls' - params = { - 'title': f'[auto-merge] {HEAD} to {BASE} [skip ci] [bot]', - 'head': HEAD, - 'base': BASE, - 'body': f'auto-merge triggered by github actions on `{HEAD}` to create a PR keeping `{BASE}` up-to-date. If ' - 'this PR is unable to be merged due to conflicts, it will remain open until manually fix.', - 'maintainer_can_modify': True - } - r = requests.post(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 201: - print('SUCCESS - create PR') - pull = r.json() - number = str(pull['number']) - sha = str(pull['head']['sha']) - return number, sha, False - if r.status_code == 422: # early-terminate if no commits between HEAD and BASE - print('SUCCESS - No commits') - print(r.json()) - return '', '', True - # FAILURE - print('FAILURE - create PR') - print(f'status code: {r.status_code}') - print(r.json()) - sys.exit(1) - - -def auto_merge(number, sha): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/pulls/{number}/merge' - params = { - 'sha': sha, - 'merge_method': 'merge' - } - r = requests.put(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 200: - comment(number, '**SUCCESS** - auto-merge') - print('SUCCESS - auto-merge') - sys.exit(0) - else: - print('FAILURE - auto-merge') - comment(number=number, content=f"""**FAILURE** - Unable to auto-merge. Manual operation is required. -``` -{r.json()} -``` - -Please use the following steps to fix the merge conflicts manually: -``` -# Assume upstream is NVIDIA/spark-rapids remote -git fetch upstream {HEAD} {BASE} -git checkout -b fix-auto-merge-conflict-{number} upstream/{BASE} -git merge upstream/{HEAD} -# Fix any merge conflicts caused by this merge -git commit -am "Merge {HEAD} into {BASE}" -git push fix-auto-merge-conflict-{number} -# Open a PR targets NVIDIA/spark-rapids {BASE} -``` -**IMPORTANT:** Before merging this PR, be sure to change the merging strategy to `Create a merge commit` (repo admin only). 
- -Once this PR is merged, the auto-merge PR should automatically be closed since it contains the same commit hashes -""") - print(f'status code: {r.status_code}') - print(r.json()) - sys.exit(1) - - -def comment(number, content): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/issues/{number}/comments' - params = { - 'body': content - } - r = requests.post(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 201: - print('SUCCESS - create comment') - else: - print('FAILURE - create comment') - print(f'status code: {r.status_code}') - print(r.json()) - - -def main(): - number, sha, term = create() - if term: - sys.exit(0) - - auto_merge(number, sha) - - -if __name__ == '__main__': - main() diff --git a/CHANGELOG.md b/CHANGELOG.md index 02e43a88303..4e258e1d66a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,159 @@ # Change log -Generated on 2024-08-18 +Generated on 2024-10-14 + +## Release 24.10 + +### Features +||| +|:---|:---| +|[#11525](https://github.com/NVIDIA/spark-rapids/issues/11525)|[FEA] If dump always is enabled dump before decoding the file| +|[#11461](https://github.com/NVIDIA/spark-rapids/issues/11461)|[FEA] Support non-UTC timezone for casting from date to timestamp| +|[#11445](https://github.com/NVIDIA/spark-rapids/issues/11445)|[FEA] Support format 'yyyyMMdd' in GetTimestamp operator| +|[#11442](https://github.com/NVIDIA/spark-rapids/issues/11442)|[FEA] Add in support for setting row group sizes for parquet| +|[#11330](https://github.com/NVIDIA/spark-rapids/issues/11330)|[FEA] Add companion metrics for all nsTiming metrics to measure time elapsed excluding semaphore wait| +|[#5223](https://github.com/NVIDIA/spark-rapids/issues/5223)|[FEA] Support array_join| +|[#10968](https://github.com/NVIDIA/spark-rapids/issues/10968)|[FEA] support min_by function| +|[#10437](https://github.com/NVIDIA/spark-rapids/issues/10437)|[FEA] Add Spark 3.5.2 snapshot support| + +### Performance +||| +|:---|:---| +|[#10799](https://github.com/NVIDIA/spark-rapids/issues/10799)|[FEA] Optimize count distinct performance optimization with null columns reuse and post expand coalesce| +|[#8301](https://github.com/NVIDIA/spark-rapids/issues/8301)|[FEA] semaphore prioritization| +|[#11234](https://github.com/NVIDIA/spark-rapids/issues/11234)|Explore swapping build table for left outer joins| +|[#11263](https://github.com/NVIDIA/spark-rapids/issues/11263)|[FEA] Cluster/pack multi_get_json_object paths by common prefixes| + +### Bugs Fixed +||| +|:---|:---| +|[#11573](https://github.com/NVIDIA/spark-rapids/issues/11573)|[BUG] very long tail task is observed when many tasks are contending for PrioritySemaphore| +|[#11367](https://github.com/NVIDIA/spark-rapids/issues/11367)|[BUG] Error "table_view.cpp:36: Column size mismatch" when using approx_percentile on a string column| +|[#11543](https://github.com/NVIDIA/spark-rapids/issues/11543)|[BUG] test_yyyyMMdd_format_for_legacy_mode[DATAGEN_SEED=1727619674, TZ=UTC] failed GPU and CPU are not both null| +|[#11500](https://github.com/NVIDIA/spark-rapids/issues/11500)|[BUG] dataproc serverless Integration tests failing in json_matrix_test.py| +|[#11384](https://github.com/NVIDIA/spark-rapids/issues/11384)|[BUG] "rs. 
shuffle write time" negative values seen in app history log| +|[#11509](https://github.com/NVIDIA/spark-rapids/issues/11509)|[BUG] buildall no longer works| +|[#11501](https://github.com/NVIDIA/spark-rapids/issues/11501)|[BUG] test_yyyyMMdd_format_for_legacy_mode failed in Dataproc Serverless integration tests| +|[#11502](https://github.com/NVIDIA/spark-rapids/issues/11502)|[BUG] IT script failed get jars as we stop deploying intermediate jars since 24.10| +|[#11479](https://github.com/NVIDIA/spark-rapids/issues/11479)|[BUG] spark400 build failed do not conform to class UnaryExprMeta's type parameter| +|[#8558](https://github.com/NVIDIA/spark-rapids/issues/8558)|[BUG] `from_json` generated inconsistent result comparing with CPU for input column with nested json strings| +|[#11485](https://github.com/NVIDIA/spark-rapids/issues/11485)|[BUG] Integration tests failing in join_test.py| +|[#11481](https://github.com/NVIDIA/spark-rapids/issues/11481)|[BUG] non-utc integration tests failing in json_test.py| +|[#10911](https://github.com/NVIDIA/spark-rapids/issues/10911)|from_json: when input is a bad json string, rapids would throw an exception.| +|[#10457](https://github.com/NVIDIA/spark-rapids/issues/10457)|[BUG] ScanJson and JsonToStructs allow unquoted control chars by default| +|[#10479](https://github.com/NVIDIA/spark-rapids/issues/10479)|[BUG] JsonToStructs and ScanJson should return null for non-numeric, non-boolean non-quoted strings| +|[#10534](https://github.com/NVIDIA/spark-rapids/issues/10534)|[BUG] Need Improved JSON Validation | +|[#11436](https://github.com/NVIDIA/spark-rapids/issues/11436)|[BUG] Mortgage unit tests fail with RAPIDS shuffle manager| +|[#11437](https://github.com/NVIDIA/spark-rapids/issues/11437)|[BUG] array and map casts to string tests failed| +|[#11463](https://github.com/NVIDIA/spark-rapids/issues/11463)|[BUG] hash_groupby_approx_percentile failed assert is None| +|[#11465](https://github.com/NVIDIA/spark-rapids/issues/11465)|[BUG] java.lang.NoClassDefFoundError: org/apache/spark/BuildInfo$ in non-databricks environment| +|[#11359](https://github.com/NVIDIA/spark-rapids/issues/11359)|[BUG] a couple of arithmetic_ops_test.py cases failed mismatching cpu and gpu values with [DATAGEN_SEED=1723985531, TZ=UTC, INJECT_OOM]| +|[#11392](https://github.com/NVIDIA/spark-rapids/issues/11392)|[AUDIT] Handle IgnoreNulls Expressions for Window Expressions| +|[#10770](https://github.com/NVIDIA/spark-rapids/issues/10770)|[BUG] Slow/no progress with cascaded pandas udfs/mapInPandas in Databricks| +|[#11397](https://github.com/NVIDIA/spark-rapids/issues/11397)|[BUG] We should not be using copyWithBooleanColumnAsValidity unless we can prove it is 100% safe| +|[#11372](https://github.com/NVIDIA/spark-rapids/issues/11372)|[BUG] spark400 failed compiling datagen_2.13| +|[#11364](https://github.com/NVIDIA/spark-rapids/issues/11364)|[BUG] Missing numRows in the ColumnarBatch created in GpuBringBackToHost| +|[#11350](https://github.com/NVIDIA/spark-rapids/issues/11350)|[BUG] spark400 compile failed in scala213| +|[#11346](https://github.com/NVIDIA/spark-rapids/issues/11346)|[BUG] databrick nightly failing with not able to get spark-version-info.properties| +|[#9604](https://github.com/NVIDIA/spark-rapids/issues/9604)|[BUG] Delta Lake metadata query detection can trigger extra file listing jobs| +|[#11318](https://github.com/NVIDIA/spark-rapids/issues/11318)|[BUG] GPU query is case sensitive on Hive text table's column name| 
+|[#10596](https://github.com/NVIDIA/spark-rapids/issues/10596)|[BUG] ScanJson and JsonToStructs does not deal with escaped single quotes properly| +|[#10351](https://github.com/NVIDIA/spark-rapids/issues/10351)|[BUG] test_from_json_mixed_types_list_struct failed| +|[#11294](https://github.com/NVIDIA/spark-rapids/issues/11294)|[BUG] binary-dedupe leaves around a copy of "unshimmed" class files in spark-shared| +|[#11183](https://github.com/NVIDIA/spark-rapids/issues/11183)|[BUG] Failed to split an empty string with error "ai.rapids.cudf.CudfException: parallel_for failed: cudaErrorInvalidDevice: invalid device ordinal"| +|[#11008](https://github.com/NVIDIA/spark-rapids/issues/11008)|Fix tests failures in ast_test.py| +|[#11265](https://github.com/NVIDIA/spark-rapids/issues/11265)|[BUG] segfaults seen in cuDF after prefetch calls intermittently| +|[#11025](https://github.com/NVIDIA/spark-rapids/issues/11025)|Fix tests failures in date_time_test.py| +|[#11065](https://github.com/NVIDIA/spark-rapids/issues/11065)|[BUG] Spark Connect Server (3.5.1) Can Not Running Correctly| + +### PRs +||| +|:---|:---| +|[#11576](https://github.com/NVIDIA/spark-rapids/pull/11576)|Update rapids JNI and private dependency to 24.10.0| +|[#11582](https://github.com/NVIDIA/spark-rapids/pull/11582)|[DOC] update doc for 24.10 release [skip ci]| +|[#11588](https://github.com/NVIDIA/spark-rapids/pull/11588)|backport fixes of #11573 to branch 24.10| +|[#11569](https://github.com/NVIDIA/spark-rapids/pull/11569)|Have "dump always" dump input files before trying to decode them| +|[#11567](https://github.com/NVIDIA/spark-rapids/pull/11567)|Fix test case unix_timestamp(col, 'yyyyMMdd') failed for Africa/Casablanca timezone and LEGACY mode| +|[#11496](https://github.com/NVIDIA/spark-rapids/pull/11496)|Update test now that code is fixed| +|[#11548](https://github.com/NVIDIA/spark-rapids/pull/11548)|Fix negative rs. 
shuffle write time| +|[#11545](https://github.com/NVIDIA/spark-rapids/pull/11545)|Update test case related to LEACY datetime format to unblock nightly CI| +|[#11515](https://github.com/NVIDIA/spark-rapids/pull/11515)|Propagate default DIST_PROFILE_OPT profile to Maven in buildall| +|[#11497](https://github.com/NVIDIA/spark-rapids/pull/11497)|Update from_json to use new cudf features| +|[#11516](https://github.com/NVIDIA/spark-rapids/pull/11516)|Deploy all submodules for default sparkver in nightly [skip ci]| +|[#11484](https://github.com/NVIDIA/spark-rapids/pull/11484)|Fix FileAlreadyExistsException in LORE dump process| +|[#11457](https://github.com/NVIDIA/spark-rapids/pull/11457)|GPU device watermark metrics| +|[#11507](https://github.com/NVIDIA/spark-rapids/pull/11507)|Replace libmamba-solver with mamba command [skip ci]| +|[#11503](https://github.com/NVIDIA/spark-rapids/pull/11503)|Download artifacts via wget [skip ci]| +|[#11490](https://github.com/NVIDIA/spark-rapids/pull/11490)|Use UnaryLike instead of UnaryExpression| +|[#10798](https://github.com/NVIDIA/spark-rapids/pull/10798)|Optimizing Expand+Aggregate in sqls with many count distinct| +|[#11366](https://github.com/NVIDIA/spark-rapids/pull/11366)|Enable parquet suites from Spark UT| +|[#11477](https://github.com/NVIDIA/spark-rapids/pull/11477)|Install cuDF-py against python 3.10 on Databricks| +|[#11462](https://github.com/NVIDIA/spark-rapids/pull/11462)|Support non-UTC timezone for casting from date type to timestamp type| +|[#11449](https://github.com/NVIDIA/spark-rapids/pull/11449)|Support yyyyMMdd in GetTimestamp operator for LEGACY mode| +|[#11456](https://github.com/NVIDIA/spark-rapids/pull/11456)|Enable tests for all JSON white space normalization| +|[#11483](https://github.com/NVIDIA/spark-rapids/pull/11483)|Use reusable auto-merge workflow [skip ci]| +|[#11482](https://github.com/NVIDIA/spark-rapids/pull/11482)|Fix a json test for non utc time zone| +|[#11464](https://github.com/NVIDIA/spark-rapids/pull/11464)|Use improved CUDF JSON validation| +|[#11474](https://github.com/NVIDIA/spark-rapids/pull/11474)|Enable tests after string_split was fixed| +|[#11473](https://github.com/NVIDIA/spark-rapids/pull/11473)|Revert "Skip test_hash_groupby_approx_percentile byte and double test…| +|[#11466](https://github.com/NVIDIA/spark-rapids/pull/11466)|Replace scala.util.Try with a try statement in the DBR buildinfo| +|[#11469](https://github.com/NVIDIA/spark-rapids/pull/11469)|Skip test_hash_groupby_approx_percentile byte and double tests tempor…| +|[#11429](https://github.com/NVIDIA/spark-rapids/pull/11429)|Fixed some of the failing parquet_tests| +|[#11455](https://github.com/NVIDIA/spark-rapids/pull/11455)|Log DBR BuildInfo| +|[#11451](https://github.com/NVIDIA/spark-rapids/pull/11451)|xfail array and map cast to string tests| +|[#11331](https://github.com/NVIDIA/spark-rapids/pull/11331)|Add companion metrics for all nsTiming metrics without semaphore| +|[#11421](https://github.com/NVIDIA/spark-rapids/pull/11421)|[DOC] remove the redundant archive link [skip ci]| +|[#11308](https://github.com/NVIDIA/spark-rapids/pull/11308)|Dynamic Shim Detection for `build` Process| +|[#11427](https://github.com/NVIDIA/spark-rapids/pull/11427)|Update CI scripts to work with the "Dynamic Shim Detection" change [skip ci]| +|[#11425](https://github.com/NVIDIA/spark-rapids/pull/11425)|Update signoff usage [skip ci]| +|[#11420](https://github.com/NVIDIA/spark-rapids/pull/11420)|Add in array_join support| 
+|[#11418](https://github.com/NVIDIA/spark-rapids/pull/11418)|stop using copyWithBooleanColumnAsValidity| +|[#11411](https://github.com/NVIDIA/spark-rapids/pull/11411)|Fix asymmetric join crash when stream side is empty| +|[#11395](https://github.com/NVIDIA/spark-rapids/pull/11395)|Fix a Pandas UDF slowness issue| +|[#11371](https://github.com/NVIDIA/spark-rapids/pull/11371)|Support MinBy and MaxBy for non-float ordering| +|[#11399](https://github.com/NVIDIA/spark-rapids/pull/11399)|stop using copyWithBooleanColumnAsValidity| +|[#11389](https://github.com/NVIDIA/spark-rapids/pull/11389)|prevent duplicate queueing in the prio semaphore| +|[#11291](https://github.com/NVIDIA/spark-rapids/pull/11291)|Add distinct join support for right outer joins| +|[#11396](https://github.com/NVIDIA/spark-rapids/pull/11396)|Drop cudf-py python 3.9 support [skip ci]| +|[#11393](https://github.com/NVIDIA/spark-rapids/pull/11393)|Revert work-around for empty split-string| +|[#11334](https://github.com/NVIDIA/spark-rapids/pull/11334)|Add support for Spark 3.5.2| +|[#11388](https://github.com/NVIDIA/spark-rapids/pull/11388)|JSON tests for corrected date, timestamp, and mixed types| +|[#11375](https://github.com/NVIDIA/spark-rapids/pull/11375)|Fix spark400 build in datagen and tests| +|[#11376](https://github.com/NVIDIA/spark-rapids/pull/11376)|Create a PrioritySemaphore to back the GpuSemaphore| +|[#11383](https://github.com/NVIDIA/spark-rapids/pull/11383)|Fix nightly snapshots being downloaded in premerge build| +|[#11368](https://github.com/NVIDIA/spark-rapids/pull/11368)|Move SparkRapidsBuildInfoEvent to its own file| +|[#11329](https://github.com/NVIDIA/spark-rapids/pull/11329)|Change reference to `MapUtils` into `JSONUtils`| +|[#11365](https://github.com/NVIDIA/spark-rapids/pull/11365)|Set numRows for the ColumnBatch created in GpuBringBackToHost| +|[#11363](https://github.com/NVIDIA/spark-rapids/pull/11363)|Fix failing test compile for Spark 4.0.0| +|[#11362](https://github.com/NVIDIA/spark-rapids/pull/11362)|Add tests for repeated JSON columns/keys| +|[#11321](https://github.com/NVIDIA/spark-rapids/pull/11321)|conform dependency list in 341db to previous versions style| +|[#10604](https://github.com/NVIDIA/spark-rapids/pull/10604)|Add string escaping JSON tests to the test_json_matrix| +|[#11328](https://github.com/NVIDIA/spark-rapids/pull/11328)|Swap build side for outer joins when natural build side is explosive| +|[#11358](https://github.com/NVIDIA/spark-rapids/pull/11358)|Fix download doc [skip ci]| +|[#11357](https://github.com/NVIDIA/spark-rapids/pull/11357)|Fix auto merge conflict 11354 [skip ci]| +|[#11347](https://github.com/NVIDIA/spark-rapids/pull/11347)|Revert "Fix the mismatching default configs in integration tests (#11283)"| +|[#11323](https://github.com/NVIDIA/spark-rapids/pull/11323)|replace inputFiles with location.rootPaths.toString| +|[#11340](https://github.com/NVIDIA/spark-rapids/pull/11340)|Audit script - Check commits from sql-hive directory [skip ci]| +|[#11283](https://github.com/NVIDIA/spark-rapids/pull/11283)|Fix the mismatching default configs in integration tests | +|[#11327](https://github.com/NVIDIA/spark-rapids/pull/11327)|Make hive column matches not case-sensitive| +|[#11324](https://github.com/NVIDIA/spark-rapids/pull/11324)|Append ustcfy to blossom-ci whitelist [skip ci]| +|[#11325](https://github.com/NVIDIA/spark-rapids/pull/11325)|Fix auto merge conflict 11317 [skip ci]| +|[#11319](https://github.com/NVIDIA/spark-rapids/pull/11319)|Update passing JSON tests after list 
support added in CUDF| +|[#11307](https://github.com/NVIDIA/spark-rapids/pull/11307)|Safely close multiple resources in RapidsBufferCatalog| +|[#11313](https://github.com/NVIDIA/spark-rapids/pull/11313)|Fix auto merge conflict 10845 11310 [skip ci]| +|[#11312](https://github.com/NVIDIA/spark-rapids/pull/11312)|Add jihoonson as an authorized user for blossom-ci [skip ci]| +|[#11302](https://github.com/NVIDIA/spark-rapids/pull/11302)|Fix display issue of lore.md| +|[#11301](https://github.com/NVIDIA/spark-rapids/pull/11301)|Skip deploying non-critical intermediate artifacts [skip ci]| +|[#11299](https://github.com/NVIDIA/spark-rapids/pull/11299)|Enable get_json_object by default and remove legacy version| +|[#11289](https://github.com/NVIDIA/spark-rapids/pull/11289)|Use the new chunked API from multi-get_json_object| +|[#11295](https://github.com/NVIDIA/spark-rapids/pull/11295)|Remove redundant classes from the dist jar and unshimmed list| +|[#11284](https://github.com/NVIDIA/spark-rapids/pull/11284)|Use distinct count to estimate join magnification factor| +|[#11288](https://github.com/NVIDIA/spark-rapids/pull/11288)|Move easy unshimmed classes to sql-plugin-api| +|[#11285](https://github.com/NVIDIA/spark-rapids/pull/11285)|Remove files under tools/generated_files/spark31* [skip ci]| +|[#11280](https://github.com/NVIDIA/spark-rapids/pull/11280)|Asynchronously copy table data to the host during shuffle| +|[#11258](https://github.com/NVIDIA/spark-rapids/pull/11258)|Explicitly disable ANSI mode for ast_test.py| +|[#11267](https://github.com/NVIDIA/spark-rapids/pull/11267)|Update the rapids JNI and private dependency version to 24.10.0-SNAPSHOT| +|[#11241](https://github.com/NVIDIA/spark-rapids/pull/11241)|Auto merge PRs to branch-24.10 from branch-24.08 [skip ci]| +|[#11231](https://github.com/NVIDIA/spark-rapids/pull/11231)|Cache dependencies for scala 2.13 [skip ci]| ## Release 24.08 @@ -88,8 +242,11 @@ Generated on 2024-08-18 ### PRs ||| |:---|:---| +|[#11400](https://github.com/NVIDIA/spark-rapids/pull/11400)|[DOC] update notes in download page for the decompressing gzip issue [skip ci]| +|[#11355](https://github.com/NVIDIA/spark-rapids/pull/11355)|Update changelog for the v24.08 release [skip ci]| |[#11353](https://github.com/NVIDIA/spark-rapids/pull/11353)|Update download doc for v24.08.1 [skip ci]| |[#11352](https://github.com/NVIDIA/spark-rapids/pull/11352)|Update version to 24.08.1-SNAPSHOT [skip ci]| +|[#11337](https://github.com/NVIDIA/spark-rapids/pull/11337)|Update changelog for the v24.08 release [skip ci]| |[#11335](https://github.com/NVIDIA/spark-rapids/pull/11335)|Fix Delta Lake truncation of min/max string values| |[#11304](https://github.com/NVIDIA/spark-rapids/pull/11304)|Update changelog for v24.08.0 release [skip ci]| |[#11303](https://github.com/NVIDIA/spark-rapids/pull/11303)|Update rapids JNI and private dependency to 24.08.0| @@ -205,127 +362,5 @@ Generated on 2024-08-18 |[#10933](https://github.com/NVIDIA/spark-rapids/pull/10933)|Fixed Databricks build| |[#10929](https://github.com/NVIDIA/spark-rapids/pull/10929)|Append new authorized user to blossom-ci whitelist [skip ci]| -## Release 24.06 - -### Features -||| -|:---|:---| -|[#10850](https://github.com/NVIDIA/spark-rapids/issues/10850)|[FEA] Refine the test framework introduced in #10745| -|[#6969](https://github.com/NVIDIA/spark-rapids/issues/6969)|[FEA] Support parse_url | -|[#10496](https://github.com/NVIDIA/spark-rapids/issues/10496)|[FEA] Drop support for CentOS7| 
-|[#10760](https://github.com/NVIDIA/spark-rapids/issues/10760)|[FEA]Support ArrayFilter| -|[#10721](https://github.com/NVIDIA/spark-rapids/issues/10721)|[FEA] Dump the complete set of build-info properties to the Spark eventLog| -|[#10666](https://github.com/NVIDIA/spark-rapids/issues/10666)|[FEA] Create Spark 3.4.3 shim| - -### Performance -||| -|:---|:---| -|[#8963](https://github.com/NVIDIA/spark-rapids/issues/8963)|[FEA] Use custom kernel for parse_url| -|[#10817](https://github.com/NVIDIA/spark-rapids/issues/10817)|[FOLLOW ON] Combining regex parsing in transpiling and regex rewrite in `rlike`| -|[#10821](https://github.com/NVIDIA/spark-rapids/issues/10821)|Rewrite `pattern[A-B]{X,Y}` (a pattern string followed by X to Y chars in range A - B) in `RLIKE` to a custom kernel| - -### Bugs Fixed -||| -|:---|:---| -|[#10928](https://github.com/NVIDIA/spark-rapids/issues/10928)|[BUG] 24.06 test_conditional_with_side_effects_case_when test failed on Scala 2.13 with DATAGEN_SEED=1716656294| -|[#10941](https://github.com/NVIDIA/spark-rapids/issues/10941)|[BUG] Failed to build on databricks due to GpuOverrides.scala:4264: not found: type GpuSubqueryBroadcastMeta| -|[#10902](https://github.com/NVIDIA/spark-rapids/issues/10902)|Spark UT failed: SPARK-37360: Timestamp type inference for a mix of TIMESTAMP_NTZ and TIMESTAMP_LTZ| -|[#10899](https://github.com/NVIDIA/spark-rapids/issues/10899)|[BUG] format_number Spark UT failed because Type conversion is not allowed| -|[#10913](https://github.com/NVIDIA/spark-rapids/issues/10913)|[BUG] rlike with empty pattern failed with 'NoSuchElementException' when enabling regex rewrite| -|[#10774](https://github.com/NVIDIA/spark-rapids/issues/10774)|[BUG] Issues found by Spark UT Framework on RapidsRegexpExpressionsSuite| -|[#10606](https://github.com/NVIDIA/spark-rapids/issues/10606)|[BUG] Update Plugin to use the new `getPartitionedFile` method| -|[#10806](https://github.com/NVIDIA/spark-rapids/issues/10806)|[BUG] orc_write_test.py::test_write_round_trip_corner failed with DATAGEN_SEED=1715517863| -|[#10831](https://github.com/NVIDIA/spark-rapids/issues/10831)|[BUG] Failed to read data from iceberg| -|[#10810](https://github.com/NVIDIA/spark-rapids/issues/10810)|[BUG] NPE when running `ParseUrl` tests in `RapidsStringExpressionsSuite`| -|[#10797](https://github.com/NVIDIA/spark-rapids/issues/10797)|[BUG] udf_test test_single_aggregate_udf, test_group_aggregate_udf and test_group_apply_udf_more_types failed on DB 13.3| -|[#10719](https://github.com/NVIDIA/spark-rapids/issues/10719)|[BUG] test_exact_percentile_groupby FAILED: hash_aggregate_test.py::test_exact_percentile_groupby with DATAGEN seed 1713362217| -|[#10738](https://github.com/NVIDIA/spark-rapids/issues/10738)|[BUG] test_exact_percentile_groupby_partial_fallback_to_cpu failed with DATAGEN_SEED=1713928179| -|[#10768](https://github.com/NVIDIA/spark-rapids/issues/10768)|[DOC] Dead links with tools pages| -|[#10751](https://github.com/NVIDIA/spark-rapids/issues/10751)|[BUG] Cascaded Pandas UDFs not working as expected on Databricks when plugin is enabled| -|[#10318](https://github.com/NVIDIA/spark-rapids/issues/10318)|[BUG] `fs.azure.account.keyInvalid` configuration issue while reading from Unity Catalog Tables on Azure DB| -|[#10722](https://github.com/NVIDIA/spark-rapids/issues/10722)|[BUG] "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| -|[#10724](https://github.com/NVIDIA/spark-rapids/issues/10724)|[BUG] Failed to convert string with invisible characters to 
float| -|[#10633](https://github.com/NVIDIA/spark-rapids/issues/10633)|[BUG] ScanJson and JsonToStructs can give almost random errors| -|[#10659](https://github.com/NVIDIA/spark-rapids/issues/10659)|[BUG] from_json ArrayIndexOutOfBoundsException in 24.02| -|[#10656](https://github.com/NVIDIA/spark-rapids/issues/10656)|[BUG] Databricks cache tests failing with host memory OOM| - -### PRs -||| -|:---|:---| -|[#11222](https://github.com/NVIDIA/spark-rapids/pull/11222)|Update change log for v24.06.1 release [skip ci]| -|[#11221](https://github.com/NVIDIA/spark-rapids/pull/11221)|Change cudf version back to 24.06.0-SNAPSHOT [skip ci]| -|[#11217](https://github.com/NVIDIA/spark-rapids/pull/11217)|Update latest changelog [skip ci]| -|[#11211](https://github.com/NVIDIA/spark-rapids/pull/11211)|Use fixed seed for test_from_json_struct_decimal| -|[#11203](https://github.com/NVIDIA/spark-rapids/pull/11203)|Update version to 24.06.1-SNAPSHOT| -|[#11205](https://github.com/NVIDIA/spark-rapids/pull/11205)|Update docs for 24.06.1 release [skip ci]| -|[#11056](https://github.com/NVIDIA/spark-rapids/pull/11056)|Update latest changelog [skip ci]| -|[#11052](https://github.com/NVIDIA/spark-rapids/pull/11052)|Add spark343 shim for scala2.13 dist jar| -|[#10981](https://github.com/NVIDIA/spark-rapids/pull/10981)|Update latest changelog [skip ci]| -|[#10984](https://github.com/NVIDIA/spark-rapids/pull/10984)|[DOC] Update docs for 24.06.0 release [skip ci]| -|[#10974](https://github.com/NVIDIA/spark-rapids/pull/10974)|Update rapids JNI and private dependency to 24.06.0| -|[#10830](https://github.com/NVIDIA/spark-rapids/pull/10830)|Use ErrorClass to Throw AnalysisException| -|[#10947](https://github.com/NVIDIA/spark-rapids/pull/10947)|Prevent contains-PrefixRange optimization if not preceded by wildcards| -|[#10934](https://github.com/NVIDIA/spark-rapids/pull/10934)|Revert "Add Support for Multiple Filtering Keys for Subquery Broadcast "| -|[#10870](https://github.com/NVIDIA/spark-rapids/pull/10870)|Add support for self-contained profiling| -|[#10903](https://github.com/NVIDIA/spark-rapids/pull/10903)|Use upper case for LEGACY_TIME_PARSER_POLICY to fix a spark UT| -|[#10900](https://github.com/NVIDIA/spark-rapids/pull/10900)|Fix type convert error in format_number scalar input| -|[#10868](https://github.com/NVIDIA/spark-rapids/pull/10868)|Disable default cuDF pinned pool| -|[#10914](https://github.com/NVIDIA/spark-rapids/pull/10914)|Fix NoSuchElementException when rlike with empty pattern| -|[#10858](https://github.com/NVIDIA/spark-rapids/pull/10858)|Add Support for Multiple Filtering Keys for Subquery Broadcast | -|[#10861](https://github.com/NVIDIA/spark-rapids/pull/10861)|refine ut framework including Part 1 and Part 2| -|[#10872](https://github.com/NVIDIA/spark-rapids/pull/10872)|[DOC] ignore released plugin links to reduce the bother info [skip ci]| -|[#10839](https://github.com/NVIDIA/spark-rapids/pull/10839)|Replace anonymous classes for SortOrder and FIlterExec overrides| -|[#10873](https://github.com/NVIDIA/spark-rapids/pull/10873)|Auto merge PRs to branch-24.08 from branch-24.06 [skip ci]| -|[#10860](https://github.com/NVIDIA/spark-rapids/pull/10860)|[Spark 4.0] Account for `PartitionedFileUtil.getPartitionedFile` signature change.| -|[#10822](https://github.com/NVIDIA/spark-rapids/pull/10822)|Rewrite regex pattern `literal[a-b]{x}` to custom kernel in rlike| -|[#10833](https://github.com/NVIDIA/spark-rapids/pull/10833)|Filter out unused json_path tokens| 
-|[#10855](https://github.com/NVIDIA/spark-rapids/pull/10855)|Fix auto merge conflict 10845 [[skip ci]]| -|[#10826](https://github.com/NVIDIA/spark-rapids/pull/10826)|Add NVTX ranges to identify Spark stages and tasks| -|[#10836](https://github.com/NVIDIA/spark-rapids/pull/10836)|Catch exceptions when trying to examine Iceberg scan for metadata queries| -|[#10824](https://github.com/NVIDIA/spark-rapids/pull/10824)|Support zstd for GPU shuffle compression| -|[#10828](https://github.com/NVIDIA/spark-rapids/pull/10828)|Added DateTimeUtilsShims [Databricks]| -|[#10829](https://github.com/NVIDIA/spark-rapids/pull/10829)|Fix `Inheritance Shadowing` to add support for Spark 4.0.0| -|[#10811](https://github.com/NVIDIA/spark-rapids/pull/10811)|Fix NPE in GpuParseUrl for null keys.| -|[#10723](https://github.com/NVIDIA/spark-rapids/pull/10723)|Implement chunked ORC reader| -|[#10715](https://github.com/NVIDIA/spark-rapids/pull/10715)|Rewrite some rlike expression to StartsWith/Contains| -|[#10820](https://github.com/NVIDIA/spark-rapids/pull/10820)|workaround #10801 temporally| -|[#10812](https://github.com/NVIDIA/spark-rapids/pull/10812)|Replace ThreadPoolExecutor creation with ThreadUtils API| -|[#10813](https://github.com/NVIDIA/spark-rapids/pull/10813)|Fix the errors for Pandas UDF tests on DB13.3| -|[#10795](https://github.com/NVIDIA/spark-rapids/pull/10795)|Remove fixed seed for exact `percentile` integration tests| -|[#10805](https://github.com/NVIDIA/spark-rapids/pull/10805)|Drop Support for CentOS 7| -|[#10800](https://github.com/NVIDIA/spark-rapids/pull/10800)|Add number normalization test and address followup for getJsonObject| -|[#10796](https://github.com/NVIDIA/spark-rapids/pull/10796)|fixing build break on DBR| -|[#10791](https://github.com/NVIDIA/spark-rapids/pull/10791)|Fix auto merge conflict 10779 [skip ci]| -|[#10636](https://github.com/NVIDIA/spark-rapids/pull/10636)|Update actions version [skip ci]| -|[#10743](https://github.com/NVIDIA/spark-rapids/pull/10743)|initial PR for the framework reusing Vanilla Spark's unit tests| -|[#10767](https://github.com/NVIDIA/spark-rapids/pull/10767)|Add rows-only batches support to RebatchingRoundoffIterator| -|[#10763](https://github.com/NVIDIA/spark-rapids/pull/10763)|Add in the GpuArrayFilter command| -|[#10766](https://github.com/NVIDIA/spark-rapids/pull/10766)|Fix dead links related to tools documentation [skip ci]| -|[#10644](https://github.com/NVIDIA/spark-rapids/pull/10644)|Add logging to Integration test runs in local and local-cluster mode| -|[#10756](https://github.com/NVIDIA/spark-rapids/pull/10756)|Fix Authorization Failure While Reading Tables From Unity Catalog| -|[#10752](https://github.com/NVIDIA/spark-rapids/pull/10752)|Add SparkRapidsBuildInfoEvent to the event log| -|[#10754](https://github.com/NVIDIA/spark-rapids/pull/10754)|Substitute whoami for $USER| -|[#10755](https://github.com/NVIDIA/spark-rapids/pull/10755)|[DOC] Update README for prioritize-commits script [skip ci]| -|[#10728](https://github.com/NVIDIA/spark-rapids/pull/10728)|Let big data gen set nullability recursively| -|[#10740](https://github.com/NVIDIA/spark-rapids/pull/10740)|Use parse_url kernel for PATH parsing| -|[#10734](https://github.com/NVIDIA/spark-rapids/pull/10734)|Add short circuit path for get-json-object when there is separate wildcard path| -|[#10725](https://github.com/NVIDIA/spark-rapids/pull/10725)|Initial definition for Spark 4.0.0 shim| -|[#10635](https://github.com/NVIDIA/spark-rapids/pull/10635)|Use new getJsonObject kernel for 
json_tuple| -|[#10739](https://github.com/NVIDIA/spark-rapids/pull/10739)|Use fixed seed for some random failed tests| -|[#10720](https://github.com/NVIDIA/spark-rapids/pull/10720)|Add Shims for Spark 3.4.3| -|[#10716](https://github.com/NVIDIA/spark-rapids/pull/10716)|Remove the mixedType config for JSON as it has no downsides any longer| -|[#10733](https://github.com/NVIDIA/spark-rapids/pull/10733)|Fix "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| -|[#10718](https://github.com/NVIDIA/spark-rapids/pull/10718)|Change parameters for memory limit in Parquet chunked reader| -|[#10292](https://github.com/NVIDIA/spark-rapids/pull/10292)|Upgrade to UCX 1.16.0| -|[#10709](https://github.com/NVIDIA/spark-rapids/pull/10709)|Removing some authorizations for departed users [skip ci]| -|[#10726](https://github.com/NVIDIA/spark-rapids/pull/10726)|Append new authorized user to blossom-ci whitelist [skip ci]| -|[#10708](https://github.com/NVIDIA/spark-rapids/pull/10708)|Updated dump tool to verify get_json_object| -|[#10706](https://github.com/NVIDIA/spark-rapids/pull/10706)|Fix auto merge conflict 10704 [skip ci]| -|[#10675](https://github.com/NVIDIA/spark-rapids/pull/10675)|Fix merge conflict with branch-24.04 [skip ci]| -|[#10678](https://github.com/NVIDIA/spark-rapids/pull/10678)|Append new authorized user to blossom-ci whitelist [skip ci]| -|[#10662](https://github.com/NVIDIA/spark-rapids/pull/10662)|Audit script - Check commits from shuffle and storage directories [skip ci]| -|[#10655](https://github.com/NVIDIA/spark-rapids/pull/10655)|Update rapids jni/private dependency to 24.06| -|[#10652](https://github.com/NVIDIA/spark-rapids/pull/10652)|Substitute murmurHash32 for spark32BitMurmurHash3| - ## Older Releases Changelog of older releases can be found at [docs/archives](/docs/archives) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 430af075782..83b30747abd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. 
```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index 789219dde27..32ded1484a3 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index d9a8380c643..c97a4209545 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 4eec0e2ab02..cddcf0c1ce1 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT api_validation diff --git a/build/buildall b/build/buildall index f9ac0507922..0599d080054 100755 --- a/build/buildall +++ b/build/buildall @@ -222,6 +222,8 @@ if [[ "$GEN_BLOOP" == "true" ]]; then exit 0 fi +[[ "$DIST_PROFILE" != "" ]] && MVN_PROFILE_OPT="-P$DIST_PROFILE" || MVN_PROFILE_OPT="" + # First element in SPARK_SHIM_VERSIONS to do most of the checks export BASE_VER=${SPARK_SHIM_VERSIONS[0]} export NUM_SHIMS=${#SPARK_SHIM_VERSIONS[@]} @@ -305,7 +307,7 @@ time ( # a negligible increase of the build time by ~2 seconds. 
joinShimBuildFrom="aggregator" echo "Resuming from $joinShimBuildFrom build only using $BASE_VER" - $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $INCLUDED_BUILDVERS_OPT \ + $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $MVN_PROFILE_OPT $INCLUDED_BUILDVERS_OPT \ -Dbuildver="$BASE_VER" \ -DskipTests -Dmaven.scaladoc.skip ) diff --git a/datagen/README.md b/datagen/README.md index 300b5d331c0..022cc2f1eba 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.10.0 release would be -`target/datagen_2.12-24.10.0-spark330.jar` +for example a Spark 3.3.0 jar for the 24.12.0 release would be +`target/datagen_2.12-24.12.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.10.0-spark330.jar +spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar ``` After that you should be good to go. diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index a88bd8c2836..a728ad9a13e 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.10.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-24.12.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index ccdd6f90fe5..9bdf897cfd7 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,18 +21,19 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT datagen **/* package + ${project.build.outputDirectory}/datagen-version-info.properties diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 35760b28288..1d41911c767 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 3c671173415..7514088ca3a 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 1e576f97e98..2ed0ea3b159 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS 
Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index f0a488cb5f2..1daebdd0efb 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 883e9de2933..36ec92b70c0 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 2c086610bbc..95f54c6807c 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index d7763b00a00..4d792ee1ca5 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index ff8b8da6bf0..4b229e2e5b5 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT false diff --git a/delta-lake/delta-spark350db/pom.xml b/delta-lake/delta-spark350db/pom.xml index 122ad171a26..6640d727507 100644 --- a/delta-lake/delta-spark350db/pom.xml +++ b/delta-lake/delta-spark350db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT 
false
diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml
index e1c841cd9c9..6d0471f9f01 100644
--- a/delta-lake/delta-stub/pom.xml
+++ b/delta-lake/delta-stub/pom.xml
@@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-stub
diff --git a/df_udf/README.md b/df_udf/README.md
new file mode 100644
index 00000000000..0226c365a42
--- /dev/null
+++ b/df_udf/README.md
@@ -0,0 +1,90 @@
+# Scala / Java UDFs implemented using Dataframes
+
+User Defined Functions (UDFs) are used for a number of reasons in Apache Spark. Much of the time it is to implement
+logic that is either very difficult or impossible to implement using existing SQL/Dataframe APIs directly. But they
+are also used as a way to standardize processing logic across an organization or for code reuse.
+
+But UDFs come with some downsides. The biggest one is visibility into the processing being done. SQL is a language that
+can be highly optimized. But a UDF in most cases is a black box that the SQL optimizer cannot do anything about.
+This can result in less than ideal query planning. Additionally, accelerated execution environments, like the
+RAPIDS Accelerator for Apache Spark, have no easy way to replace UDFs with accelerated versions, which can result in
+slow performance.
+
+This plugin attempts to add visibility for the code reuse use case by providing a way to implement a UDF in terms of dataframe
+commands.
+
+## Setup
+
+To do this, include com.nvidia:df_udf_plugin as a dependency of your project and also include it on the
+classpath for your Apache Spark environment. Then include `com.nvidia.spark.DFUDFPlugin` in the config
+`spark.sql.extensions`. Now you can implement a UDF in terms of Dataframe operations.
+
+## Usage
+
+```scala
+import com.nvidia.spark.functions._
+
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions._
+
+val sum_array = df_udf((longArray: Column) =>
+  aggregate(longArray,
+    lit(0L),
+    (a, b) => coalesce(a, lit(0L)) + coalesce(b, lit(0L)),
+    a => a))
+spark.udf.register("sum_array", sum_array)
+```
+
+You can then use `sum_array` however you would have used any other UDF. This allows you to provide a drop-in replacement
+implementation of an existing UDF.
+
+```scala
+Seq(Array(1L, 2L, 3L)).toDF("data").selectExpr("sum_array(data) as result").show()
+
++------+
+|result|
++------+
+|     6|
++------+
+```
+
+## Type Checks
+
+DataFrame APIs do not provide type safety when writing the code and that is the same here. There are no built-in type
+checks for inputs yet. Also, because of how types are resolved in Spark, there is no way to adjust the query based on
+the types passed in. Type checks are handled by the SQL planner/optimizer after the UDF has been replaced. This means
+that the final SQL will not violate any type safety, but it also means that the errors might be confusing. For example,
+if I passed in an `ARRAY<DOUBLE>` to `sum_array` instead of an `ARRAY<LONG>` I would get an error like
+
+```scala
+Seq(Array(1.0, 2.0, 3.0)).toDF("data").selectExpr("sum_array(data) as result").show()
+org.apache.spark.sql.AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "aggregate(data, 0, lambdafunction((coalesce(namedlambdavariable(), 0) + coalesce(namedlambdavariable(), 0)), namedlambdavariable(), namedlambdavariable()), lambdafunction(namedlambdavariable(), namedlambdavariable()))" due to data type mismatch: Parameter 3 requires the "BIGINT" type, however "lambdafunction((coalesce(namedlambdavariable(), 0) + coalesce(namedlambdavariable(), 0)), namedlambdavariable(), namedlambdavariable())" has the type "DOUBLE".; line 1 pos 0;
+Project [aggregate(data#46, 0, lambdafunction((cast(coalesce(lambda x_9#49L, 0) as double) + coalesce(lambda y_10#50, cast(0 as double))), lambda x_9#49L, lambda y_10#50, false), lambdafunction(lambda x_11#51L, lambda x_11#51L, false)) AS result#48L]
++- Project [value#43 AS data#46]
+ +- LocalRelation [value#43]
+
+ at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
+ at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5(CheckAnalysis.scala:269)
+ at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5$adapted(CheckAnalysis.scala:256)
+```
+
+Which is not as simple to understand as a normal UDF.
+
+```scala
+val sum_array = udf((a: Array[Long]) => a.sum)
+
+spark.udf.register("sum_array", sum_array)
+
+Seq(Array(1.0, 2.0, 3.0)).toDF("data").selectExpr("sum_array(data) as result").show()
+org.apache.spark.sql.AnalysisException: [CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from "DOUBLE" to "BIGINT".
+ The type path of the target object is:
+- array element class: "long"
+- root class: "[J"
+You can either add an explicit cast to the input data or choose a higher precision type of the field in the target object
+at org.apache.spark.sql.errors.QueryCompilationErrors$.upCastFailureError(QueryCompilationErrors.scala:285)
+at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveUpCast$$fail(Analyzer.scala:3646)
+at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$$anonfun$apply$57$$anonfun$applyOrElse$234.applyOrElse(Analyzer.scala:3677)
+at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$$anonfun$apply$57$$anonfun$applyOrElse$234.applyOrElse(Analyzer.scala:3654)
+```
+
+We hope to add optional type checks in the future.
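For the Setup section of this README, a minimal sketch of what enabling the plugin from application code might look like is shown below. The `spark.sql.extensions` and `spark.jars` keys are standard Spark configuration; the application name and jar path are assumptions, loosely following the `df_udf_plugin_2.12` / `24.12.0-SNAPSHOT` coordinates in the pom that follows.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: wire com.nvidia.spark.DFUDFPlugin into a SparkSession so that
// df_udf registrations are rewritten during analysis. The jar path below is an
// assumption; point it at the df_udf_plugin artifact you actually built.
object DfUdfSetupExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("df-udf-example") // hypothetical application name
      .config("spark.sql.extensions", "com.nvidia.spark.DFUDFPlugin")
      .config("spark.jars", "df_udf/target/df_udf_plugin_2.12-24.12.0-SNAPSHOT.jar")
      .getOrCreate()
    import spark.implicits._

    import com.nvidia.spark.functions._
    import org.apache.spark.sql.Column
    import org.apache.spark.sql.functions._

    // Same sum_array df_udf as in the Usage section above.
    val sum_array = df_udf((longArray: Column) =>
      aggregate(longArray, lit(0L),
        (a, b) => coalesce(a, lit(0L)) + coalesce(b, lit(0L)),
        a => a))
    spark.udf.register("sum_array", sum_array)

    Seq(Array(1L, 2L, 3L)).toDF("data").selectExpr("sum_array(data) AS result").show()
  }
}
```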
\ No newline at end of file diff --git a/df_udf/pom.xml b/df_udf/pom.xml new file mode 100644 index 00000000000..39f33880f34 --- /dev/null +++ b/df_udf/pom.xml @@ -0,0 +1,88 @@ + + + + 4.0.0 + + com.nvidia + rapids-4-spark-shim-deps-parent_2.12 + 24.12.0-SNAPSHOT + ../shim-deps/pom.xml + + df_udf_plugin_2.12 + UDFs implemented in SQL/Dataframe + UDFs for Apache Spark implemented in SQL/Dataframe + 24.12.0-SNAPSHOT + + + df_udf + + **/* + package + ${project.build.outputDirectory}/df_udf-version-info.properties + + + + + org.scala-lang + scala-library + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.test.version} + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + true + + + + net.alchim31.maven + scala-maven-plugin + + + org.scalatest + scalatest-maven-plugin + + + org.apache.rat + apache-rat-plugin + + + + + + + ${project.build.directory}/extra-resources + + + + diff --git a/df_udf/src/main/scala/com/nvidia/spark/DFUDFPlugin.scala b/df_udf/src/main/scala/com/nvidia/spark/DFUDFPlugin.scala new file mode 100644 index 00000000000..7e1c0451c8a --- /dev/null +++ b/df_udf/src/main/scala/com/nvidia/spark/DFUDFPlugin.scala @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark + +import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule + +class DFUDFPlugin extends (SparkSessionExtensions => Unit) { + override def apply(extensions: SparkSessionExtensions): Unit = { + extensions.injectResolutionRule(logicalPlanRules) + } + + def logicalPlanRules(sparkSession: SparkSession): Rule[LogicalPlan] = { + org.apache.spark.sql.nvidia.LogicalPlanRules() + } +} \ No newline at end of file diff --git a/df_udf/src/main/scala/com/nvidia/spark/functions.scala b/df_udf/src/main/scala/com/nvidia/spark/functions.scala new file mode 100644 index 00000000000..8c8eef3f825 --- /dev/null +++ b/df_udf/src/main/scala/com/nvidia/spark/functions.scala @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark + +import org.apache.spark.sql.Column +import org.apache.spark.sql.api.java.{UDF0, UDF1, UDF10, UDF2, UDF3, UDF4, UDF5, UDF6, UDF7, UDF8, UDF9} +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.nvidia._ +import org.apache.spark.sql.types.LongType + +// scalastyle:off +object functions { +// scalastyle:on + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function0[Column]): UserDefinedFunction = + udf(DFUDF0(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function1[Column, Column]): UserDefinedFunction = + udf(DFUDF1(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function2[Column, Column, Column]): UserDefinedFunction = + udf(DFUDF2(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function3[Column, Column, Column, Column]): UserDefinedFunction = + udf(DFUDF3(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function4[Column, Column, Column, Column, Column]): UserDefinedFunction = + udf(DFUDF4(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function5[Column, Column, Column, Column, Column, Column]): UserDefinedFunction = + udf(DFUDF5(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function6[Column, Column, Column, Column, Column, Column, + Column]): UserDefinedFunction = + udf(DFUDF6(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function7[Column, Column, Column, Column, Column, Column, + Column, Column]): UserDefinedFunction = + udf(DFUDF7(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. 
+ */ + def df_udf(f: Function8[Column, Column, Column, Column, Column, Column, + Column, Column, Column]): UserDefinedFunction = + udf(DFUDF8(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function9[Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column]): UserDefinedFunction = + udf(DFUDF9(f), LongType) + + /** + * Defines a Scala closure of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: Function10[Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column, Column]): UserDefinedFunction = + udf(DFUDF10(f), LongType) + + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Java UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF0[Column]): UserDefinedFunction = { + udf(JDFUDF0(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF1[Column, Column]): UserDefinedFunction = { + udf(JDFUDF1(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF2[Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF2(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF3[Column, Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF3(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF4[Column, Column, Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF4(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF5[Column, Column, Column, Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF5(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
+ */ + def df_udf(f: UDF6[Column, Column, Column, Column, Column, Column, + Column]): UserDefinedFunction = { + udf(JDFUDF6(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF7[Column, Column, Column, Column, Column, Column, + Column, Column]): UserDefinedFunction = { + udf(JDFUDF7(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF8[Column, Column, Column, Column, Column, Column, + Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF8(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF9[Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF9(f), LongType) + } + + /** + * Defines a Java UDF instance of Columns as user-defined function (UDF). + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + */ + def df_udf(f: UDF10[Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column, Column]): UserDefinedFunction = { + udf(JDFUDF10(f), LongType) + } + +} \ No newline at end of file diff --git a/df_udf/src/main/scala/org/apache/spark/sql/nvidia/LogicalPlanRules.scala b/df_udf/src/main/scala/org/apache/spark/sql/nvidia/LogicalPlanRules.scala new file mode 100644 index 00000000000..24a123016d6 --- /dev/null +++ b/df_udf/src/main/scala/org/apache/spark/sql/nvidia/LogicalPlanRules.scala @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.nvidia + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule + +case class LogicalPlanRules() extends Rule[LogicalPlan] with Logging { + val replacePartialFunc: PartialFunction[Expression, Expression] = { + case f: ScalaUDF if DFUDF.getDFUDF(f.function).isDefined => + DFUDF.getDFUDF(f.function).map { + dfudf => DFUDFShims.columnToExpr( + dfudf(f.children.map(DFUDFShims.exprToColumn(_)).toArray)) + }.getOrElse{ + throw new IllegalStateException("Inconsistent results when extracting df_udf") + } + } + + override def apply(plan: LogicalPlan): LogicalPlan = + plan.transformExpressions(replacePartialFunc) +} diff --git a/df_udf/src/main/scala/org/apache/spark/sql/nvidia/dataframe_udfs.scala b/df_udf/src/main/scala/org/apache/spark/sql/nvidia/dataframe_udfs.scala new file mode 100644 index 00000000000..79f71ba4ca0 --- /dev/null +++ b/df_udf/src/main/scala/org/apache/spark/sql/nvidia/dataframe_udfs.scala @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.nvidia + +import java.lang.invoke.SerializedLambda + +import org.apache.spark.sql.Column +import org.apache.spark.sql.api.java._ +import org.apache.spark.util.Utils + +trait DFUDF { + def apply(input: Array[Column]): Column +} + +case class DFUDF0(f: Function0[Column]) + extends UDF0[Any] with DFUDF { + override def call(): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 0) + f() + } +} + +case class DFUDF1(f: Function1[Column, Column]) + extends UDF1[Any, Any] with DFUDF { + override def call(t1: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 1) + f(input(0)) + } +} + +case class DFUDF2(f: Function2[Column, Column, Column]) + extends UDF2[Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 2) + f(input(0), input(1)) + } +} + +case class DFUDF3(f: Function3[Column, Column, Column, Column]) + extends UDF3[Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any): Any = { + throw new IllegalStateException("TODO better error message. 
This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 3) + f(input(0), input(1), input(2)) + } +} + +case class DFUDF4(f: Function4[Column, Column, Column, Column, Column]) + extends UDF4[Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 4) + f(input(0), input(1), input(2), input(3)) + } +} + +case class DFUDF5(f: Function5[Column, Column, Column, Column, Column, Column]) + extends UDF5[Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 5) + f(input(0), input(1), input(2), input(3), input(4)) + } +} + +case class DFUDF6(f: Function6[Column, Column, Column, Column, Column, Column, Column]) + extends UDF6[Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 6) + f(input(0), input(1), input(2), input(3), input(4), input(5)) + } +} + +case class DFUDF7(f: Function7[Column, Column, Column, Column, Column, Column, Column, Column]) + extends UDF7[Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 7) + f(input(0), input(1), input(2), input(3), input(4), input(5), input(6)) + } +} + +case class DFUDF8(f: Function8[Column, Column, Column, Column, Column, Column, Column, Column, + Column]) + extends UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 8) + f(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7)) + } +} + +case class DFUDF9(f: Function9[Column, Column, Column, Column, Column, Column, Column, Column, + Column, Column]) + extends UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any, + t9: Any): Any = { + throw new IllegalStateException("TODO better error message. 
This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 9) + f(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7), input(8)) + } +} + +case class DFUDF10(f: Function10[Column, Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column]) + extends UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any, + t9: Any, t10: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 10) + f(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7), input(8), + input(9)) + } +} + +case class JDFUDF0(f: UDF0[Column]) + extends UDF0[Any] with DFUDF { + override def call(): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 0) + f.call() + } +} + +case class JDFUDF1(f: UDF1[Column, Column]) + extends UDF1[Any, Any] with DFUDF { + override def call(t1: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 1) + f.call(input(0)) + } +} + +case class JDFUDF2(f: UDF2[Column, Column, Column]) + extends UDF2[Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 2) + f.call(input(0), input(1)) + } +} + +case class JDFUDF3(f: UDF3[Column, Column, Column, Column]) + extends UDF3[Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 3) + f.call(input(0), input(1), input(2)) + } +} + +case class JDFUDF4(f: UDF4[Column, Column, Column, Column, Column]) + extends UDF4[Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 4) + f.call(input(0), input(1), input(2), input(3)) + } +} + +case class JDFUDF5(f: UDF5[Column, Column, Column, Column, Column, Column]) + extends UDF5[Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 5) + f.call(input(0), input(1), input(2), input(3), input(4)) + } +} + +case class JDFUDF6(f: UDF6[Column, Column, Column, Column, Column, Column, Column]) + extends UDF6[Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any): Any = { + throw new IllegalStateException("TODO better error message. 
This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 6) + f.call(input(0), input(1), input(2), input(3), input(4), input(5)) + } +} + +case class JDFUDF7(f: UDF7[Column, Column, Column, Column, Column, Column, Column, Column]) + extends UDF7[Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 7) + f.call(input(0), input(1), input(2), input(3), input(4), input(5), input(6)) + } +} + +case class JDFUDF8(f: UDF8[Column, Column, Column, Column, Column, Column, Column, Column, + Column]) + extends UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 8) + f.call(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7)) + } +} + +case class JDFUDF9(f: UDF9[Column, Column, Column, Column, Column, Column, Column, Column, + Column, Column]) + extends UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any, + t9: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 9) + f.call(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7), input(8)) + } +} + +case class JDFUDF10(f: UDF10[Column, Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column]) + extends UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] with DFUDF { + override def call(t1: Any, t2: Any, t3: Any, t4: Any, t5: Any, t6: Any, t7: Any, t8: Any, + t9: Any, t10: Any): Any = { + throw new IllegalStateException("TODO better error message. This should have been replaced") + } + + override def apply(input: Array[Column]): Column = { + assert(input.length == 10) + f.call(input(0), input(1), input(2), input(3), input(4), input(5), input(6), input(7), input(8), + input(9)) + } +} + +object DFUDF { + /** + * Determine if the UDF function implements the DFUDF. + */ + def getDFUDF(function: AnyRef): Option[DFUDF] = { + function match { + case f: DFUDF => Some(f) + case f => + try { + // This may be a lambda that Spark's UDFRegistration wrapped around a Java UDF instance. + val clazz = f.getClass + if (Utils.getSimpleName(clazz).toLowerCase().contains("lambda")) { + // Try to find a `writeReplace` method, further indicating it is likely a lambda + // instance, and invoke it to serialize the lambda. Once serialized, captured arguments + // can be examined to locate the Java UDF instance. + // Note this relies on implementation details of Spark's UDFRegistration class. 
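+          // For a serializable lambda, `writeReplace` returns a `SerializedLambda`; when exactly
+          // one argument was captured, it is expected to be the wrapped UDF instance checked below.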
+ val writeReplace = clazz.getDeclaredMethod("writeReplace") + writeReplace.setAccessible(true) + val serializedLambda = writeReplace.invoke(f).asInstanceOf[SerializedLambda] + if (serializedLambda.getCapturedArgCount == 1) { + serializedLambda.getCapturedArg(0) match { + case c: DFUDF => Some(c) + case _ => None + } + } else { + None + } + } else { + None + } + } catch { + case _: ClassCastException | _: NoSuchMethodException | _: SecurityException => None + } + } + } +} diff --git a/df_udf/src/main/spark320/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala b/df_udf/src/main/spark320/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala new file mode 100644 index 00000000000..98124a984a5 --- /dev/null +++ b/df_udf/src/main/spark320/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "350db"} +{"spark": "351"} +{"spark": "352"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.nvidia + +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.Expression + +object DFUDFShims { + def columnToExpr(c: Column): Expression = c.expr + def exprToColumn(e: Expression): Column = Column(e) +} diff --git a/df_udf/src/main/spark400/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala b/df_udf/src/main/spark400/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala new file mode 100644 index 00000000000..e67dfb450d8 --- /dev/null +++ b/df_udf/src/main/spark400/scala/org/apache/spark/sql/nvidia/DFUDFShims.scala @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.nvidia + +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} + +object DFUDFShims { + def columnToExpr(c: Column): Expression = c + def exprToColumn(e: Expression): Column = e +} diff --git a/df_udf/src/test/scala/com/nvidia/spark/functionsSuite.scala b/df_udf/src/test/scala/com/nvidia/spark/functionsSuite.scala new file mode 100644 index 00000000000..ae6d46aefdf --- /dev/null +++ b/df_udf/src/test/scala/com/nvidia/spark/functionsSuite.scala @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark + +import com.nvidia.spark.functions._ + +import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.nvidia.SparkTestBase +import org.apache.spark.sql.types._ + +class functionsSuite extends SparkTestBase { + test("basic 0 arg df_udf") { + val zero = df_udf(() => lit(0)) + withSparkSession{ spark => + spark.udf.register("zero", zero) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).selectExpr("id", "zero()").collect()) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).select(col("id"), zero()).collect()) + } + } + + test("basic 1 arg df_udf") { + val inc = df_udf((input: Column) => input + 1) + withSparkSession { spark => + spark.udf.register("inc", inc) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "inc(id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).select(col("id"), inc(col("id"))).collect()) + } + } + + + test("basic 2 arg df_udf") { + val add = df_udf((a: Column, b:Column) => a + b) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "add(id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).select(col("id"), add(col("id"), col("id"))).collect()) + } + } + + test("basic 3 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column) => a + b + c) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).selectExpr("id", "add(id, id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), col("id"))).collect()) + } + } + + test("basic 4 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column) => a + b + c + d) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + 
spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), col("id"))).collect()) + } + } + + test("basic 5 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column) => + a + b + c + d + e) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1))).collect()) + } + } + + test("basic 6 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, f:Column) => + a + b + c + d + e + f) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"))).collect()) + } + } + + test("basic 7 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column) => a + b + c + d + e + f + g) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"))).collect()) + } + } + + test("basic 8 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column) => a + b + c + d + e + f + g + h) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2))).collect()) + } + } + + test("basic 9 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column) => + a + b + c + d + e + f + g + h + i) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"))).collect()) + } + } + + test("basic 10 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column, j:Column) => + a + b + c + d + e + f + g + h + i + j) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"), col("id"))).collect()) + } + } + + test("nested df_udf") { + val add = df_udf((a: Column, b:Column) => a + b) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 22L), + 
Row(1L, 25L)), + spark.range(2).selectExpr("id", "add(add(id, 12), add(add(id, id), 10))").collect()) + } + } + + test("complex df_udf") { + val extractor = df_udf((json: Column) => { + val schema = StructType(Seq(StructField("values", ArrayType(LongType)))) + val extracted_json = from_json(json, schema, Map.empty[String, String]) + aggregate(extracted_json("values"), + lit(0L), + (a, b) => coalesce(a, lit(0L)) + coalesce(b, lit(0L)), + a => a) + }) + withSparkSession { spark => + import spark.implicits._ + spark.udf.register("extractor", extractor) + assertSame(Array( + Row(6L), + Row(3L)), + Seq("""{"values":[1,2,3]}""", + """{"values":[1, null, null, 2]}""").toDF("json").selectExpr("extractor(json)").collect()) + } + } + + test("j basic 0 arg df_udf") { + val zero = df_udf(new UDF0[Column] { + override def call(): Column = lit(0) + }) + withSparkSession{ spark => + spark.udf.register("zero", zero) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).selectExpr("id", "zero()").collect()) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).select(col("id"), zero()).collect()) + } + } + + test("jbasic 1 arg df_udf") { + val inc = df_udf(new UDF1[Column, Column] { + override def call(a: Column): Column = a + 1 + }) + withSparkSession { spark => + spark.udf.register("inc", inc) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "inc(id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).select(col("id"), inc(col("id"))).collect()) + } + } + + test("jbasic 2 arg df_udf") { + val add = df_udf(new UDF2[Column, Column, Column] { + override def call(a: Column, b:Column): Column = a + b + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "add(id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).select(col("id"), add(col("id"), col("id"))).collect()) + } + } + + test("jbasic 3 arg df_udf") { + val add = df_udf(new UDF3[Column, Column, Column, Column] { + override def call(a: Column, b: Column, c: Column): Column = a + b + c + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).selectExpr("id", "add(id, id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), col("id"))).collect()) + } + } + + test("jbasic 4 arg df_udf") { + val add = df_udf(new UDF4[Column, Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column): Column = a + b + c + d + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), col("id"))).collect()) + } + } + + test("jbasic 5 arg df_udf") { + val add = df_udf(new UDF5[Column, Column, Column, Column, Column, Column] { + override def call(a: Column, b: Column, c: Column, d: Column, e: Column): Column = + a + b + c + d + e + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), 
+ col("id"), lit(1))).collect()) + } + } + + test("jbasic 6 arg df_udf") { + val add = df_udf(new UDF6[Column, Column, Column, Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, f:Column) = + a + b + c + d + e + f + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"))).collect()) + } + } + + test("jbasic 7 arg df_udf") { + val add = df_udf(new UDF7[Column, Column, Column, Column, Column, Column, Column, + Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column): Column = a + b + c + d + e + f + g + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"))).collect()) + } + } + + test("jbasic 8 arg df_udf") { + val add = df_udf(new UDF8[Column, Column, Column, Column, Column, Column, Column, + Column, Column] { + override def call(a: Column, b: Column, c: Column, d: Column, e: Column, + f: Column, g: Column, h: Column): Column = a + b + c + d + e + f + g + h + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2))).collect()) + } + } + + test("jbasic 9 arg df_udf") { + val add = df_udf(new UDF9[Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column): Column = + a + b + c + d + e + f + g + h + i + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"))).collect()) + } + } + + test("jbasic 10 arg df_udf") { + val add = df_udf(new UDF10[Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column, j:Column): Column = + a + b + c + d + e + f + g + h + i + j + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"), col("id"))).collect()) + } + } +} \ No newline at end of file diff --git a/df_udf/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala 
b/df_udf/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala new file mode 100644 index 00000000000..2bd6697ffad --- /dev/null +++ b/df_udf/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.nvidia + +import java.io.File +import java.nio.file.Files +import java.util.{Locale, TimeZone} + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} + +object SparkSessionHolder extends Logging { + private var spark = createSparkSession() + private var origConf = spark.conf.getAll + private var origConfKeys = origConf.keys.toSet + + private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach { + case (key, value) if spark.conf.get(key, null) != value => + spark.conf.set(key, value) + case _ => // No need to modify it + } + + private def createSparkSession(): SparkSession = { + SparkSession.cleanupAnyExistingSession() + + TimeZone.setDefault(TimeZone.getTimeZone("UTC")) + Locale.setDefault(Locale.US) + + val builder = SparkSession.builder() + .master("local[1]") + .config("spark.sql.extensions", "com.nvidia.spark.DFUDFPlugin") + .config("spark.sql.warehouse.dir", sparkWarehouseDir.getAbsolutePath) + .appName("dataframe udf tests") + + builder.getOrCreate() + } + + private def reinitSession(): Unit = { + spark = createSparkSession() + origConf = spark.conf.getAll + origConfKeys = origConf.keys.toSet + } + + def sparkSession: SparkSession = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } + spark + } + + def resetSparkSessionConf(): Unit = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } else { + setAllConfs(origConf.toArray) + val currentKeys = spark.conf.getAll.keys.toSet + val toRemove = currentKeys -- origConfKeys + if (toRemove.contains("spark.shuffle.manager")) { + // cannot unset the config so need to reinitialize + reinitSession() + } else { + toRemove.foreach(spark.conf.unset) + } + } + logDebug(s"RESET CONF TO: ${spark.conf.getAll}") + } + + def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = { + resetSparkSessionConf() + logDebug(s"SETTING CONF: ${conf.getAll.toMap}") + setAllConfs(conf.getAll) + logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n") + f(spark) + } + + private lazy val sparkWarehouseDir: File = { + new File(System.getProperty("java.io.tmpdir")).mkdirs() + val path = Files.createTempDirectory("spark-warehouse") + val file = new File(path.toString) + file.deleteOnExit() + file + } +} + +/** + * Base to be able to run tests with a spark context + */ +trait SparkTestBase extends AnyFunSuite with BeforeAndAfterAll { + def withSparkSession[U](f: SparkSession => U): U = { + withSparkSession(new SparkConf, f) + } + + def withSparkSession[U](conf: SparkConf, f: SparkSession => 
U): U = { + SparkSessionHolder.withSparkSession(conf, f) + } + + override def afterAll(): Unit = { + super.afterAll() + SparkSession.cleanupAnyExistingSession() + } + + def assertSame(expected: Any, actual: Any, epsilon: Double = 0.0, + path: List[String] = List.empty): Unit = { + def assertDoublesAreEqualWithinPercentage(expected: Double, + actual: Double, path: List[String]): Unit = { + if (expected != actual) { + if (expected != 0) { + val v = Math.abs((expected - actual) / expected) + assert(v <= epsilon, + s"$path: ABS($expected - $actual) / ABS($actual) == $v is not <= $epsilon ") + } else { + val v = Math.abs(expected - actual) + assert(v <= epsilon, s"$path: ABS($expected - $actual) == $v is not <= $epsilon ") + } + } + } + (expected, actual) match { + case (a: Float, b: Float) if a.isNaN && b.isNaN => + case (a: Double, b: Double) if a.isNaN && b.isNaN => + case (null, null) => + case (null, other) => fail(s"$path: expected is null, but actual is $other") + case (other, null) => fail(s"$path: expected is $other, but actual is null") + case (a: Array[_], b: Array[_]) => + assert(a.length == b.length, + s"$path: expected (${a.toList}) and actual (${b.toList}) lengths don't match") + a.indices.foreach { i => + assertSame(a(i), b(i), epsilon, path :+ i.toString) + } + case (a: Map[_, _], b: Map[_, _]) => + throw new IllegalStateException(s"Maps are not supported yet for comparison $a vs $b") + case (a: Iterable[_], b: Iterable[_]) => + assert(a.size == b.size, + s"$path: expected (${a.toList}) and actual (${b.toList}) lengths don't match") + var i = 0 + a.zip(b).foreach { + case (l, r) => + assertSame(l, r, epsilon, path :+ i.toString) + i += 1 + } + case (a: Product, b: Product) => + assertSame(a.productIterator.toSeq, b.productIterator.toSeq, epsilon, path) + case (a: Row, b: Row) => + assertSame(a.toSeq, b.toSeq, epsilon, path) + // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. + case (a: Double, b: Double) if epsilon <= 0 => + java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) + case (a: Double, b: Double) if epsilon > 0 => + assertDoublesAreEqualWithinPercentage(a, b, path) + case (a: Float, b: Float) if epsilon <= 0 => + java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) + case (a: Float, b: Float) if epsilon > 0 => + assertDoublesAreEqualWithinPercentage(a, b, path) + case (a, b) => + assert(a == b, s"$path: $a != $b") + } + } +} diff --git a/dist/pom.xml b/dist/pom.xml index 84103299bdc..d628dd4ba3b 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia diff --git a/docs/archive.md b/docs/archive.md index 32d6c36ad69..641167a3488 100644 --- a/docs/archive.md +++ b/docs/archive.md @@ -5,6 +5,95 @@ nav_order: 15 --- Below are archived releases for RAPIDS Accelerator for Apache Spark. +## Release v24.08.1 +### Hardware Requirements: + +The plugin is tested on the following architectures: + + GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs + +### Software Requirements: + + OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8 + + NVIDIA Driver*: R470+ + + Runtime: + Scala 2.12, 2.13 + Python, Java Virtual Machine (JVM) compatible with your spark-version. 
+ + * Check the Spark documentation for Python and Java version compatibility with your specific + Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1. + + Supported Spark versions: + Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 + Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4 + Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3 + Apache Spark 3.5.0, 3.5.1 + + Supported Databricks runtime versions for Azure and AWS: + Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0) + Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2) + Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1) + + Supported Dataproc versions (Debian/Ubuntu/Rocky): + GCP Dataproc 2.1 + GCP Dataproc 2.2 + + Supported Dataproc Serverless versions: + Spark runtime 1.1 LTS + Spark runtime 2.0 + Spark runtime 2.1 + Spark runtime 2.2 + +*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet +for your hardware's minimum driver version. + +*For Cloudera and EMR support, please refer to the +[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ. + +### RAPIDS Accelerator's Support Policy for Apache Spark +The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html) + +### Download RAPIDS Accelerator for Apache Spark v24.08.1 + +| Processor | Scala Version | Download Jar | Download Signature | +|-----------|---------------|--------------|--------------------| +| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1.jar.asc) | +| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1.jar.asc) | +| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1-cuda11-arm64.jar.asc) | +| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1-cuda11-arm64.jar.asc) | + +This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with +CUDA 11.8 through CUDA 12.0. + +### Verify signature +* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com). 
+* Import the public key: `gpg --import PUB_KEY` +* Verify the signature for Scala 2.12 jar: + `gpg --verify rapids-4-spark_2.12-24.08.1.jar.asc rapids-4-spark_2.12-24.08.1.jar` +* Verify the signature for Scala 2.13 jar: + `gpg --verify rapids-4-spark_2.13-24.08.1.jar.asc rapids-4-spark_2.13-24.08.1.jar` + +The output of signature verify: + + gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) " + +### Release Notes +* Support timezones with daylight savings shifts +* Improve metrics in Spark UI +* Refactor Parquet decode microkernels and support load balancing RLE runs +* Improve get_json performance +* Support dynamic scan filtering +* Improve UCX shuffle +* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases) + +Note: There is a known issue in the 24.08.1 release when decompressing gzip files on H100 GPUs. +Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661). + +For a detailed list of changes, please refer to the +[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md). + ## Release v24.06.1 ### Hardware Requirements: diff --git a/docs/archives/CHANGELOG_24.02-to-24.04.md b/docs/archives/CHANGELOG_24.02-to-24.06.md similarity index 81% rename from docs/archives/CHANGELOG_24.02-to-24.04.md rename to docs/archives/CHANGELOG_24.02-to-24.06.md index dbcacf3133e..d95307a1efe 100644 --- a/docs/archives/CHANGELOG_24.02-to-24.04.md +++ b/docs/archives/CHANGELOG_24.02-to-24.06.md @@ -1,5 +1,127 @@ # Change log -Generated on 2024-08-06 +Generated on 2024-10-09 +## Release 24.06 + +### Features +||| +|:---|:---| +|[#10850](https://github.com/NVIDIA/spark-rapids/issues/10850)|[FEA] Refine the test framework introduced in #10745| +|[#6969](https://github.com/NVIDIA/spark-rapids/issues/6969)|[FEA] Support parse_url | +|[#10496](https://github.com/NVIDIA/spark-rapids/issues/10496)|[FEA] Drop support for CentOS7| +|[#10760](https://github.com/NVIDIA/spark-rapids/issues/10760)|[FEA]Support ArrayFilter| +|[#10721](https://github.com/NVIDIA/spark-rapids/issues/10721)|[FEA] Dump the complete set of build-info properties to the Spark eventLog| +|[#10666](https://github.com/NVIDIA/spark-rapids/issues/10666)|[FEA] Create Spark 3.4.3 shim| + +### Performance +||| +|:---|:---| +|[#8963](https://github.com/NVIDIA/spark-rapids/issues/8963)|[FEA] Use custom kernel for parse_url| +|[#10817](https://github.com/NVIDIA/spark-rapids/issues/10817)|[FOLLOW ON] Combining regex parsing in transpiling and regex rewrite in `rlike`| +|[#10821](https://github.com/NVIDIA/spark-rapids/issues/10821)|Rewrite `pattern[A-B]{X,Y}` (a pattern string followed by X to Y chars in range A - B) in `RLIKE` to a custom kernel| + +### Bugs Fixed +||| +|:---|:---| +|[#10928](https://github.com/NVIDIA/spark-rapids/issues/10928)|[BUG] 24.06 test_conditional_with_side_effects_case_when test failed on Scala 2.13 with DATAGEN_SEED=1716656294| +|[#10941](https://github.com/NVIDIA/spark-rapids/issues/10941)|[BUG] Failed to build on databricks due to GpuOverrides.scala:4264: not found: type GpuSubqueryBroadcastMeta| +|[#10902](https://github.com/NVIDIA/spark-rapids/issues/10902)|Spark UT failed: SPARK-37360: Timestamp type inference for a mix of TIMESTAMP_NTZ and TIMESTAMP_LTZ| +|[#10899](https://github.com/NVIDIA/spark-rapids/issues/10899)|[BUG] format_number Spark UT failed because Type conversion is not allowed| +|[#10913](https://github.com/NVIDIA/spark-rapids/issues/10913)|[BUG] rlike 
with empty pattern failed with 'NoSuchElementException' when enabling regex rewrite| +|[#10774](https://github.com/NVIDIA/spark-rapids/issues/10774)|[BUG] Issues found by Spark UT Framework on RapidsRegexpExpressionsSuite| +|[#10606](https://github.com/NVIDIA/spark-rapids/issues/10606)|[BUG] Update Plugin to use the new `getPartitionedFile` method| +|[#10806](https://github.com/NVIDIA/spark-rapids/issues/10806)|[BUG] orc_write_test.py::test_write_round_trip_corner failed with DATAGEN_SEED=1715517863| +|[#10831](https://github.com/NVIDIA/spark-rapids/issues/10831)|[BUG] Failed to read data from iceberg| +|[#10810](https://github.com/NVIDIA/spark-rapids/issues/10810)|[BUG] NPE when running `ParseUrl` tests in `RapidsStringExpressionsSuite`| +|[#10797](https://github.com/NVIDIA/spark-rapids/issues/10797)|[BUG] udf_test test_single_aggregate_udf, test_group_aggregate_udf and test_group_apply_udf_more_types failed on DB 13.3| +|[#10719](https://github.com/NVIDIA/spark-rapids/issues/10719)|[BUG] test_exact_percentile_groupby FAILED: hash_aggregate_test.py::test_exact_percentile_groupby with DATAGEN seed 1713362217| +|[#10738](https://github.com/NVIDIA/spark-rapids/issues/10738)|[BUG] test_exact_percentile_groupby_partial_fallback_to_cpu failed with DATAGEN_SEED=1713928179| +|[#10768](https://github.com/NVIDIA/spark-rapids/issues/10768)|[DOC] Dead links with tools pages| +|[#10751](https://github.com/NVIDIA/spark-rapids/issues/10751)|[BUG] Cascaded Pandas UDFs not working as expected on Databricks when plugin is enabled| +|[#10318](https://github.com/NVIDIA/spark-rapids/issues/10318)|[BUG] `fs.azure.account.keyInvalid` configuration issue while reading from Unity Catalog Tables on Azure DB| +|[#10722](https://github.com/NVIDIA/spark-rapids/issues/10722)|[BUG] "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| +|[#10724](https://github.com/NVIDIA/spark-rapids/issues/10724)|[BUG] Failed to convert string with invisible characters to float| +|[#10633](https://github.com/NVIDIA/spark-rapids/issues/10633)|[BUG] ScanJson and JsonToStructs can give almost random errors| +|[#10659](https://github.com/NVIDIA/spark-rapids/issues/10659)|[BUG] from_json ArrayIndexOutOfBoundsException in 24.02| +|[#10656](https://github.com/NVIDIA/spark-rapids/issues/10656)|[BUG] Databricks cache tests failing with host memory OOM| + +### PRs +||| +|:---|:---| +|[#11222](https://github.com/NVIDIA/spark-rapids/pull/11222)|Update change log for v24.06.1 release [skip ci]| +|[#11221](https://github.com/NVIDIA/spark-rapids/pull/11221)|Change cudf version back to 24.06.0-SNAPSHOT [skip ci]| +|[#11217](https://github.com/NVIDIA/spark-rapids/pull/11217)|Update latest changelog [skip ci]| +|[#11211](https://github.com/NVIDIA/spark-rapids/pull/11211)|Use fixed seed for test_from_json_struct_decimal| +|[#11203](https://github.com/NVIDIA/spark-rapids/pull/11203)|Update version to 24.06.1-SNAPSHOT| +|[#11205](https://github.com/NVIDIA/spark-rapids/pull/11205)|Update docs for 24.06.1 release [skip ci]| +|[#11056](https://github.com/NVIDIA/spark-rapids/pull/11056)|Update latest changelog [skip ci]| +|[#11052](https://github.com/NVIDIA/spark-rapids/pull/11052)|Add spark343 shim for scala2.13 dist jar| +|[#10981](https://github.com/NVIDIA/spark-rapids/pull/10981)|Update latest changelog [skip ci]| +|[#10984](https://github.com/NVIDIA/spark-rapids/pull/10984)|[DOC] Update docs for 24.06.0 release [skip ci]| +|[#10974](https://github.com/NVIDIA/spark-rapids/pull/10974)|Update rapids JNI and private 
dependency to 24.06.0| +|[#10830](https://github.com/NVIDIA/spark-rapids/pull/10830)|Use ErrorClass to Throw AnalysisException| +|[#10947](https://github.com/NVIDIA/spark-rapids/pull/10947)|Prevent contains-PrefixRange optimization if not preceded by wildcards| +|[#10934](https://github.com/NVIDIA/spark-rapids/pull/10934)|Revert "Add Support for Multiple Filtering Keys for Subquery Broadcast "| +|[#10870](https://github.com/NVIDIA/spark-rapids/pull/10870)|Add support for self-contained profiling| +|[#10903](https://github.com/NVIDIA/spark-rapids/pull/10903)|Use upper case for LEGACY_TIME_PARSER_POLICY to fix a spark UT| +|[#10900](https://github.com/NVIDIA/spark-rapids/pull/10900)|Fix type convert error in format_number scalar input| +|[#10868](https://github.com/NVIDIA/spark-rapids/pull/10868)|Disable default cuDF pinned pool| +|[#10914](https://github.com/NVIDIA/spark-rapids/pull/10914)|Fix NoSuchElementException when rlike with empty pattern| +|[#10858](https://github.com/NVIDIA/spark-rapids/pull/10858)|Add Support for Multiple Filtering Keys for Subquery Broadcast | +|[#10861](https://github.com/NVIDIA/spark-rapids/pull/10861)|refine ut framework including Part 1 and Part 2| +|[#10872](https://github.com/NVIDIA/spark-rapids/pull/10872)|[DOC] ignore released plugin links to reduce the bother info [skip ci]| +|[#10839](https://github.com/NVIDIA/spark-rapids/pull/10839)|Replace anonymous classes for SortOrder and FIlterExec overrides| +|[#10873](https://github.com/NVIDIA/spark-rapids/pull/10873)|Auto merge PRs to branch-24.08 from branch-24.06 [skip ci]| +|[#10860](https://github.com/NVIDIA/spark-rapids/pull/10860)|[Spark 4.0] Account for `PartitionedFileUtil.getPartitionedFile` signature change.| +|[#10822](https://github.com/NVIDIA/spark-rapids/pull/10822)|Rewrite regex pattern `literal[a-b]{x}` to custom kernel in rlike| +|[#10833](https://github.com/NVIDIA/spark-rapids/pull/10833)|Filter out unused json_path tokens| +|[#10855](https://github.com/NVIDIA/spark-rapids/pull/10855)|Fix auto merge conflict 10845 [[skip ci]]| +|[#10826](https://github.com/NVIDIA/spark-rapids/pull/10826)|Add NVTX ranges to identify Spark stages and tasks| +|[#10836](https://github.com/NVIDIA/spark-rapids/pull/10836)|Catch exceptions when trying to examine Iceberg scan for metadata queries| +|[#10824](https://github.com/NVIDIA/spark-rapids/pull/10824)|Support zstd for GPU shuffle compression| +|[#10828](https://github.com/NVIDIA/spark-rapids/pull/10828)|Added DateTimeUtilsShims [Databricks]| +|[#10829](https://github.com/NVIDIA/spark-rapids/pull/10829)|Fix `Inheritance Shadowing` to add support for Spark 4.0.0| +|[#10811](https://github.com/NVIDIA/spark-rapids/pull/10811)|Fix NPE in GpuParseUrl for null keys.| +|[#10723](https://github.com/NVIDIA/spark-rapids/pull/10723)|Implement chunked ORC reader| +|[#10715](https://github.com/NVIDIA/spark-rapids/pull/10715)|Rewrite some rlike expression to StartsWith/Contains| +|[#10820](https://github.com/NVIDIA/spark-rapids/pull/10820)|workaround #10801 temporally| +|[#10812](https://github.com/NVIDIA/spark-rapids/pull/10812)|Replace ThreadPoolExecutor creation with ThreadUtils API| +|[#10813](https://github.com/NVIDIA/spark-rapids/pull/10813)|Fix the errors for Pandas UDF tests on DB13.3| +|[#10795](https://github.com/NVIDIA/spark-rapids/pull/10795)|Remove fixed seed for exact `percentile` integration tests| +|[#10805](https://github.com/NVIDIA/spark-rapids/pull/10805)|Drop Support for CentOS 7| +|[#10800](https://github.com/NVIDIA/spark-rapids/pull/10800)|Add number 
normalization test and address followup for getJsonObject| +|[#10796](https://github.com/NVIDIA/spark-rapids/pull/10796)|fixing build break on DBR| +|[#10791](https://github.com/NVIDIA/spark-rapids/pull/10791)|Fix auto merge conflict 10779 [skip ci]| +|[#10636](https://github.com/NVIDIA/spark-rapids/pull/10636)|Update actions version [skip ci]| +|[#10743](https://github.com/NVIDIA/spark-rapids/pull/10743)|initial PR for the framework reusing Vanilla Spark's unit tests| +|[#10767](https://github.com/NVIDIA/spark-rapids/pull/10767)|Add rows-only batches support to RebatchingRoundoffIterator| +|[#10763](https://github.com/NVIDIA/spark-rapids/pull/10763)|Add in the GpuArrayFilter command| +|[#10766](https://github.com/NVIDIA/spark-rapids/pull/10766)|Fix dead links related to tools documentation [skip ci]| +|[#10644](https://github.com/NVIDIA/spark-rapids/pull/10644)|Add logging to Integration test runs in local and local-cluster mode| +|[#10756](https://github.com/NVIDIA/spark-rapids/pull/10756)|Fix Authorization Failure While Reading Tables From Unity Catalog| +|[#10752](https://github.com/NVIDIA/spark-rapids/pull/10752)|Add SparkRapidsBuildInfoEvent to the event log| +|[#10754](https://github.com/NVIDIA/spark-rapids/pull/10754)|Substitute whoami for $USER| +|[#10755](https://github.com/NVIDIA/spark-rapids/pull/10755)|[DOC] Update README for prioritize-commits script [skip ci]| +|[#10728](https://github.com/NVIDIA/spark-rapids/pull/10728)|Let big data gen set nullability recursively| +|[#10740](https://github.com/NVIDIA/spark-rapids/pull/10740)|Use parse_url kernel for PATH parsing| +|[#10734](https://github.com/NVIDIA/spark-rapids/pull/10734)|Add short circuit path for get-json-object when there is separate wildcard path| +|[#10725](https://github.com/NVIDIA/spark-rapids/pull/10725)|Initial definition for Spark 4.0.0 shim| +|[#10635](https://github.com/NVIDIA/spark-rapids/pull/10635)|Use new getJsonObject kernel for json_tuple| +|[#10739](https://github.com/NVIDIA/spark-rapids/pull/10739)|Use fixed seed for some random failed tests| +|[#10720](https://github.com/NVIDIA/spark-rapids/pull/10720)|Add Shims for Spark 3.4.3| +|[#10716](https://github.com/NVIDIA/spark-rapids/pull/10716)|Remove the mixedType config for JSON as it has no downsides any longer| +|[#10733](https://github.com/NVIDIA/spark-rapids/pull/10733)|Fix "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| +|[#10718](https://github.com/NVIDIA/spark-rapids/pull/10718)|Change parameters for memory limit in Parquet chunked reader| +|[#10292](https://github.com/NVIDIA/spark-rapids/pull/10292)|Upgrade to UCX 1.16.0| +|[#10709](https://github.com/NVIDIA/spark-rapids/pull/10709)|Removing some authorizations for departed users [skip ci]| +|[#10726](https://github.com/NVIDIA/spark-rapids/pull/10726)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#10708](https://github.com/NVIDIA/spark-rapids/pull/10708)|Updated dump tool to verify get_json_object| +|[#10706](https://github.com/NVIDIA/spark-rapids/pull/10706)|Fix auto merge conflict 10704 [skip ci]| +|[#10675](https://github.com/NVIDIA/spark-rapids/pull/10675)|Fix merge conflict with branch-24.04 [skip ci]| +|[#10678](https://github.com/NVIDIA/spark-rapids/pull/10678)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#10662](https://github.com/NVIDIA/spark-rapids/pull/10662)|Audit script - Check commits from shuffle and storage directories [skip ci]| +|[#10655](https://github.com/NVIDIA/spark-rapids/pull/10655)|Update 
rapids jni/private dependency to 24.06| +|[#10652](https://github.com/NVIDIA/spark-rapids/pull/10652)|Substitute murmurHash32 for spark32BitMurmurHash3| + ## Release 24.04 ### Features diff --git a/docs/compatibility.md b/docs/compatibility.md index f9af6764498..1cc0e80902a 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -651,6 +651,8 @@ guaranteed to produce the same results as the CPU: - `dd/MM/yyyy` - `yyyy/MM/dd` - `yyyy-MM-dd` +- `yyyyMMdd` +- `yyyymmdd` - `yyyy/MM/dd HH:mm:ss` - `yyyy-MM-dd HH:mm:ss` @@ -659,6 +661,11 @@ LEGACY timeParserPolicy support has the following limitations when running on th - Only 4 digit years are supported - The proleptic Gregorian calendar is used instead of the hybrid Julian+Gregorian calendar that Spark uses in legacy mode +- When the format is `yyyyMMdd`, the GPU only supports 8-digit strings. Spark also accepts 7-digit + strings such as `2024101`, which the GPU does not. Only the `UTC` and `Asia/Shanghai` timezones have been tested. +- When the format is `yyyymmdd`, the GPU only supports 8-digit strings. Spark also accepts 7-digit + strings such as `2024101`, which the GPU does not. Only the `UTC` and `Asia/Shanghai` timezones have been tested. + ## Formatting dates and timestamps as strings diff --git a/docs/configs.md b/docs/configs.md index 5d6b386aa42..2d757a40779 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` @@ -45,7 +45,6 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.multiThreadedRead.numThreads|The maximum number of threads on each executor to use for reading small files in parallel. This can not be changed at runtime after the executor has started. Used with COALESCING and MULTITHREADED readers, see spark.rapids.sql.format.parquet.reader.type, spark.rapids.sql.format.orc.reader.type, or spark.rapids.sql.format.avro.reader.type for a discussion of reader types. If it is not set explicitly and spark.executor.cores is set, it will be tried to assign value of `max(MULTITHREAD_READ_NUM_THREADS_DEFAULT, spark.executor.cores)`, where MULTITHREAD_READ_NUM_THREADS_DEFAULT = 20.|20|Startup spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647|Runtime spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647|Runtime -spark.rapids.sql.shuffle.spillThreads|Number of threads used to spill shuffle data to disk in the background.|6|Runtime spark.rapids.sql.udfCompiler.enabled|When set to true, Scala UDFs will be considered for compilation as Catalyst expressions|false|Runtime For more advanced configs, please refer to the [RAPIDS Accelerator for Apache Spark Advanced Configuration](./additional-functionality/advanced_configs.md) page.
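The `yyyyMMdd` / `yyyymmdd` LEGACY-mode support documented in the compatibility.md hunk above can be exercised end to end from PySpark. Below is a minimal sketch, assuming a session with the RAPIDS Accelerator jar on the classpath and using the same two configs that the new `test_formats_for_legacy_mode` test in this patch sets; everything else (names, sample values) is illustrative.

```python
# Sketch only (not part of this patch): parse 8-digit date strings with the
# yyyyMMdd format under LEGACY timeParserPolicy. Config keys come from the docs
# and the new date_time_test.py test in this diff; the sample data is made up.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
         .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
         .config("spark.rapids.sql.incompatibleDateFormats.enabled", "true")
         .getOrCreate())

df = spark.createDataFrame([("20241001",), ("19990315",)], ["a"])
# 8-digit strings are supported on the GPU; a 7-digit string such as '2024101' is not.
df.selectExpr(
    "unix_timestamp(a, 'yyyyMMdd') AS seconds",
    "from_unixtime(unix_timestamp(a, 'yyyyMMdd'), 'yyyyMMdd') AS roundtrip").show()
```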
diff --git a/docs/dev/lore.md b/docs/dev/lore.md index 4a7725b4bfd..b155df54052 100644 --- a/docs/dev/lore.md +++ b/docs/dev/lore.md @@ -38,7 +38,10 @@ partitions. You also need to set `spark.rapids.sql.lore.dumpPath` to tell LORE where to dump the data, the value of which should point to a directory. All dumped data of a query will live in this -directory. A typical directory hierarchy would look like this: +directory. Note, the directory may either not exist, in which case it will be created, or it should be empty. +If the directory exists and contains files, an `IllegalArgumentException` will be thrown to prevent overwriting existing data. + +A typical directory hierarchy would look like this: ```console + loreId-10/ diff --git a/docs/dev/shims.md b/docs/dev/shims.md index c3a4b57de7f..0d62eb4cae8 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/docs/dev/testing.md b/docs/dev/testing.md index af4d97d1699..9f1c33091f1 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -5,5 +5,5 @@ nav_order: 2 parent: Developer Overview --- An overview of testing can be found within the repository at: -* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.10/tests#readme) -* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.10/integration_tests#readme) +* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/tests#readme) +* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/integration_tests#readme) diff --git a/docs/download.md b/docs/download.md index eb67fb77bfb..85f04e6a912 100644 --- a/docs/download.md +++ b/docs/download.md @@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details. -## Release v24.08.1 +## Release v24.10.0 ### Hardware Requirements: The plugin is tested on the following architectures: @@ -42,7 +42,7 @@ The plugin is tested on the following architectures: Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4 Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3 - Apache Spark 3.5.0, 3.5.1 + Apache Spark 3.5.0, 3.5.1, 3.5.2 Supported Databricks runtime versions for Azure and AWS: Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0) @@ -68,14 +68,14 @@ for your hardware's minimum driver version. 
### RAPIDS Accelerator's Support Policy for Apache Spark The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html) -### Download RAPIDS Accelerator for Apache Spark v24.08.1 +### Download RAPIDS Accelerator for Apache Spark v24.10.0 | Processor | Scala Version | Download Jar | Download Signature | |-----------|---------------|--------------|--------------------| -| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1.jar.asc) | -| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1.jar.asc) | -| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.08.1/rapids-4-spark_2.12-24.08.1-cuda11-arm64.jar.asc) | -| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.08.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.08.1/rapids-4-spark_2.13-24.08.1-cuda11-arm64.jar.asc) | +| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar.asc) | +| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar.asc) | +| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar.asc) | +| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar.asc) | This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with CUDA 11.8 through CUDA 12.0. @@ -84,24 +84,24 @@ CUDA 11.8 through CUDA 12.0. * Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com). 
* Import the public key: `gpg --import PUB_KEY` * Verify the signature for Scala 2.12 jar: - `gpg --verify rapids-4-spark_2.12-24.08.1.jar.asc rapids-4-spark_2.12-24.08.1.jar` + `gpg --verify rapids-4-spark_2.12-24.10.0.jar.asc rapids-4-spark_2.12-24.10.0.jar` * Verify the signature for Scala 2.13 jar: - `gpg --verify rapids-4-spark_2.13-24.08.1.jar.asc rapids-4-spark_2.13-24.08.1.jar` + `gpg --verify rapids-4-spark_2.13-24.10.0.jar.asc rapids-4-spark_2.13-24.10.0.jar` The output of signature verify: gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) " ### Release Notes -* Support timezones with daylight savings shifts -* Improve metrics in Spark UI -* Refactor Parquet decode microkernels and support load balancing RLE runs -* Improve get_json performance -* Support dynamic scan filtering -* Improve UCX shuffle +* Optimize scheduling policy for GPU Semaphore +* Support distinct join for right outer joins +* Support MinBy and MaxBy for non-float ordering +* Support ArrayJoin expression +* Optimize Expand and Aggregate expression performance +* Improve JSON related expressions * For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases) -Note: There is a known issue in the 24.08.1 release when decompressing gzip files on H100 GPUs. +Note: There is a known issue in the 24.10.0 release when decompressing gzip files on H100 GPUs. Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661). For a detailed list of changes, please refer to the diff --git a/integration_tests/README.md b/integration_tests/README.md index 547c23052b4..f5237de21a0 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -263,7 +263,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-24.10.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.10.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -286,7 +286,7 @@ If you just want to verify the SQL replacement is working you will need to add t assumes CUDA 11.0 is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. 
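Beyond what `runtests.py` asserts, a quick way to confirm the SQL replacement described above is actually happening is to look for `Gpu*` operators in a physical plan. The following is a small sketch, assuming a session already launched with the RAPIDS jar and `spark.plugins=com.nvidia.spark.SQLPlugin` as in the `spark-submit` example; the query itself is arbitrary.

```python
# Illustrative check, not part of the test framework: inspect the physical plan of a
# trivial aggregation and look for Gpu-prefixed operators (e.g. GpuHashAggregate).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes plugin/jar were configured at launch
df = spark.range(1000).selectExpr("id % 10 AS k").groupBy("k").count()
df.explain()  # Gpu* nodes in the plan indicate the replacement is working
```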
@@ -443,7 +443,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.10.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index f46e228ddd0..d9f47fab5cb 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-24.10.0-SNAPSHOT-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 3ea20b75610..aaff3455298 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT integration_tests diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 96e68e4a550..83cc4922b3b 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -18,7 +18,7 @@ from conftest import is_not_utc, is_supported_time_zone, is_dataproc_serverless_runtime from data_gen import * from spark_session import * -from marks import allow_non_gpu, approximate_float, datagen_overrides, tz_sensitive_test +from marks import allow_non_gpu, approximate_float, datagen_overrides, disable_ansi_mode, tz_sensitive_test from pyspark.sql.types import * from spark_init_internal import spark_version from datetime import date, datetime @@ -26,13 +26,27 @@ _decimal_gen_36_5 = DecimalGen(precision=36, scale=5) -def test_cast_empty_string_to_int(): + +def test_cast_empty_string_to_int_ansi_off(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, StringGen(pattern="")).selectExpr( 'CAST(a as BYTE)', 'CAST(a as SHORT)', 'CAST(a as INTEGER)', - 'CAST(a as LONG)')) + 'CAST(a as LONG)'), + conf=ansi_disabled_conf) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11552") +def test_cast_empty_string_to_int_ansi_on(): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, StringGen(pattern="")).selectExpr( + 'CAST(a as BYTE)', + 'CAST(a as SHORT)', + 'CAST(a as INTEGER)', + 'CAST(a as 
LONG)').collect(), + conf=ansi_enabled_conf, + error_message="cannot be cast to ") # These tests are not intended to be exhaustive. The scala test CastOpSuite should cover # just about everything for non-nested values. This is intended to check that the @@ -61,12 +75,22 @@ def test_cast_nested(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type))) -def test_cast_string_date_valid_format(): +def test_cast_string_date_valid_format_ansi_off(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())), - conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) + conf = copy_and_update(ansi_disabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False})) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11556") +def test_cast_string_date_valid_format_ansi_on(): + # In Spark 3.2.0+ the valid format changed, and we cannot support all formats. + # This provides values that are valid in all of those formats. + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())).collect(), + conf = copy_and_update(ansi_enabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False}), + error_message="One or more values could not be converted to DateType") invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim '1970 T', ' 1970-01T', '1970-01 A', # not conform to "yyyy-[M]M" after trim @@ -94,8 +118,8 @@ def test_cast_string_date_invalid_ansi_before_320(): data_rows = [(v,) for v in values_string_to_data] assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}, ) + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}, ) # test Spark versions >= 320 and databricks, ANSI mode, valid values @pytest.mark.skipif(is_before_spark_320(), reason="Spark versions(< 320) not support Ansi mode when casting string to date") @@ -103,8 +127,8 @@ def test_cast_string_date_valid_ansi(): data_rows = [(v,) for v in valid_values_string_to_date] assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}) + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}) # test Spark versions >= 320, ANSI mode @pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") @@ -112,8 +136,8 @@ def test_cast_string_date_valid_ansi(): def test_cast_string_date_invalid_ansi(invalid): assert_gpu_and_cpu_error( lambda spark: spark.createDataFrame([(invalid,)], "a string").select(f.col('a').cast(DateType())).collect(), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}, + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}, error_message="DateTimeException") @@ -144,7 +168,8 @@ def test_cast_string_date_non_ansi(): data_rows = [(v,) for v in values_string_to_data] 
assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) + conf=copy_and_update(ansi_disabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False})) + @pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1), StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), @@ -153,32 +178,65 @@ def test_cast_string_date_non_ansi(): ids=idfn) @tz_sensitive_test @allow_non_gpu(*non_utc_allow) -def test_cast_string_ts_valid_format(data_gen): +def test_cast_string_ts_valid_format_ansi_off(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), - conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) + conf = copy_and_update(ansi_disabled_conf, + {'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.rapids.sql.castStringToTimestamp.enabled': True})) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11556") +@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1)], + ids=idfn) +@tz_sensitive_test +@allow_non_gpu(*non_utc_allow) +def test_cast_string_ts_valid_format_ansi_on(data_gen): + # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. + # This provides values that are valid in all of those formats. + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), + conf = copy_and_update(ansi_enabled_conf, + {'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.rapids.sql.castStringToTimestamp.enabled': True})) + @allow_non_gpu('ProjectExec', 'Cast', 'Alias') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years") -def test_cast_string_date_fallback(): +def test_cast_string_date_fallback_ansi_off(): + """ + This tests that STRING->DATE conversion is run on CPU, via a fallback. + The point of this test is to exercise the fallback, and not to examine any errors in casting. + There is no change in behaviour between Apache Spark and the plugin, since they're both + exercising the CPU implementation. Therefore, this needn't be tested with ANSI enabled. + """ assert_gpu_fallback_collect( # Cast back to String because this goes beyond what python can support for years lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(DateType()).cast(StringType())), - 'Cast') + 'Cast', + conf=ansi_disabled_conf) @allow_non_gpu('ProjectExec', 'Cast', 'Alias') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years") def test_cast_string_timestamp_fallback(): + """ + This tests that STRING->TIMESTAMP conversion is run on CPU, via a fallback. + The point of this test is to exercise the fallback, and not to examine any errors in casting. + There is no change in behaviour between Apache Spark and the plugin, since they're both + exercising the CPU implementation. Therefore, this needn't be tested with ANSI enabled. 
+ """ assert_gpu_fallback_collect( # Cast back to String because this goes beyond what python can support for years lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(TimestampType()).cast(StringType())), 'Cast', - conf = {'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) + conf = copy_and_update(ansi_disabled_conf, + {'spark.rapids.sql.castStringToTimestamp.enabled': True})) +@disable_ansi_mode # In ANSI mode, there are restrictions to casting DECIMAL to other types. + # ANSI mode behaviour is tested in test_ansi_cast_decimal_to. @approximate_float @pytest.mark.parametrize('data_gen', [ decimal_gen_32bit, @@ -191,10 +249,10 @@ def test_cast_string_timestamp_fallback(): DecimalGen(precision=38, scale=10), DecimalGen(precision=36, scale=-5), DecimalGen(precision=38, scale=-10)], ids=meta_idfn('from:')) @pytest.mark.parametrize('to_type', [ByteType(), ShortType(), IntegerType(), LongType(), FloatType(), DoubleType(), StringType()], ids=meta_idfn('to:')) -def test_cast_decimal_to(data_gen, to_type): +def test_with_ansi_disabled_cast_decimal_to(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')), - conf = {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + conf = {'spark.rapids.sql.castDecimalToFloat.enabled': True}) @approximate_float @pytest.mark.parametrize('data_gen', [ @@ -210,6 +268,8 @@ def test_ansi_cast_decimal_to(data_gen, to_type): conf = {'spark.rapids.sql.castDecimalToFloat.enabled': True, 'spark.sql.ansi.enabled': True}) + +@disable_ansi_mode # With ANSI enabled, casting from wider to narrower types will fail. @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10050') @pytest.mark.parametrize('data_gen', [ DecimalGen(7, 1), @@ -226,10 +286,24 @@ def test_ansi_cast_decimal_to(data_gen, to_type): DecimalType(30, -4), DecimalType(38, -10), DecimalType(1, -1)], ids=meta_idfn('to:')) -def test_cast_decimal_to_decimal(data_gen, to_type): +def test_with_ansi_disabled_cast_decimal_to_decimal(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a'))) + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11550") +@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10050') +@pytest.mark.parametrize('data_gen', [ + DecimalGen(3, 0)], ids=meta_idfn('from:')) +@pytest.mark.parametrize('to_type', [ + DecimalType(1, -1)], ids=meta_idfn('to:')) +def test_ansi_cast_failures_decimal_to_decimal(data_gen, to_type): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')).collect(), + conf=ansi_enabled_conf, + error_message="overflow occurred") + + @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen], ids=idfn) @pytest.mark.parametrize('to_type', [ DecimalType(2, 0), @@ -240,10 +314,21 @@ def test_cast_decimal_to_decimal(data_gen, to_type): DecimalType(10, 2), DecimalType(18, 0), DecimalType(18, 2)], ids=idfn) -def test_cast_integral_to_decimal(data_gen, to_type): +def test_cast_integral_to_decimal_ansi_off(data_gen, to_type): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).select( + f.col('a').cast(to_type)), + conf=ansi_disabled_conf) + + +@pytest.mark.skip("https://github.com/NVIDIA/spark-rapids/issues/11550") +@pytest.mark.parametrize('data_gen', [long_gen], 
ids=idfn) +@pytest.mark.parametrize('to_type', [DecimalType(2, 0)], ids=idfn) +def test_cast_integral_to_decimal_ansi_on(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( - f.col('a').cast(to_type))) + f.col('a').cast(to_type)), + conf=ansi_enabled_conf) def test_cast_byte_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( @@ -278,11 +363,28 @@ def test_cast_long_to_decimal_overflow(): DecimalType(30, 3), DecimalType(5, -3), DecimalType(3, 0)], ids=idfn) -def test_cast_floating_point_to_decimal(data_gen, to_type): +def test_cast_floating_point_to_decimal_ansi_off(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( f.col('a'), f.col('a').cast(to_type)), - conf={'spark.rapids.sql.castFloatToDecimal.enabled': 'true'}) + conf=copy_and_update( + ansi_disabled_conf, + {'spark.rapids.sql.castFloatToDecimal.enabled': True})) + + +@pytest.mark.skip("https://github.com/NVIDIA/spark-rapids/issues/11550") +@pytest.mark.parametrize('data_gen', [FloatGen(special_cases=_float_special_cases)]) +@pytest.mark.parametrize('to_type', [DecimalType(7, 1)]) +def test_cast_floating_point_to_decimal_ansi_on(data_gen, to_type): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, data_gen).select( + f.col('a'), + f.col('a').cast(to_type)).collect(), + conf=copy_and_update( + ansi_enabled_conf, + {'spark.rapids.sql.castFloatToDecimal.enabled': True}), + error_message="[NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION]") + # casting these types to string should be passed basic_gens_for_cast_to_string = [ByteGen, ShortGen, IntegerGen, LongGen, StringGen, BooleanGen, DateGen, TimestampGen] @@ -322,9 +424,8 @@ def _assert_cast_to_string_equal (data_gen, conf): ) -@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/11437") @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -348,19 +449,18 @@ def test_cast_double_to_string(): assert from_cpu_float == from_gpu_float @pytest.mark.parametrize('data_gen', [ArrayGen(sub) for sub in not_matched_struct_array_gens_for_cast_to_string], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to string is not exact match') def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - {"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) -@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/11437") @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -369,18 +469,18 @@ def test_cast_map_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', not_matched_map_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to 
string is not exact match') def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - {"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -402,7 +502,7 @@ def was_broken_for_nested_null(spark): assert_gpu_and_cpu_are_equal_collect( was_broken_for_nested_null, - {"spark.sql.legacy.castComplexTypesToString.enabled": 'true' if cast_conf == 'LEGACY' else 'false'} + {"spark.sql.legacy.castComplexTypesToString.enabled": True if cast_conf == 'LEGACY' else False} ) # https://github.com/NVIDIA/spark-rapids/issues/2315 @@ -419,16 +519,16 @@ def broken_df(spark): assert_gpu_and_cpu_are_equal_collect( broken_df, - {"spark.sql.legacy.castComplexTypesToString.enabled": 'true' if cast_conf == 'LEGACY' else 'false'} + {"spark.sql.legacy.castComplexTypesToString.enabled": True if cast_conf == 'LEGACY' else False} ) @pytest.mark.parametrize('data_gen', [StructGen([["first", element_gen]]) for element_gen in not_matched_struct_array_gens_for_cast_to_string], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to string is not an exact match') def test_cast_struct_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - {"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) @@ -483,13 +583,17 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_cast_float_to_timestamp_for_nan_inf(type): +def test_with_ansi_off_cast_float_to_timestamp_for_nan_inf(type): + """ + Tests the behaviour of floats when cast to timestamp, with ANSI disabled. + ANSI mode tests are covered in test_cast_float_to_timestamp_ansi_for_nan_inf. + """ def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] schema = StructType([StructField("value", type, True)]) df = spark.createDataFrame(data, schema) return df.select(f.col('value').cast(TimestampType())) - assert_gpu_and_cpu_are_equal_collect(fun) + assert_gpu_and_cpu_are_equal_collect(fun, conf=ansi_disabled_conf) # gen for casting long to timestamp, range is about in [0000, 9999] long_gen_to_timestamp = LongGen(max_val=math.floor((9999-1970) * 365 * 86400), @@ -556,11 +660,20 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) + +@pytest.mark.skipif(is_databricks_runtime() and is_databricks_version_or_later(14, 3), + reason="https://github.com/NVIDIA/spark-rapids/issues/11555") +@pytest.mark.skipif(not is_databricks_runtime() and is_spark_400_or_later(), + reason="https://github.com/NVIDIA/spark-rapids/issues/11555") def test_cast_timestamp_to_numeric_non_ansi(): + """ + Test timestamp->numeric conversions with ANSI off. 
+ """ assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", - "cast(a as float)", "cast(a as double)")) + "cast(a as float)", "cast(a as double)"), + conf=ansi_disabled_conf) @allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_string(): @@ -728,8 +841,6 @@ def test_cast_int_to_string_not_UTC(): {"spark.sql.session.timeZone": "+08"}) not_utc_fallback_test_params = [(timestamp_gen, 'STRING'), - # python does not like year 0, and with time zones the default start date can become year 0 :( - (DateGen(start=date(1, 1, 1)), 'TIMESTAMP'), (SetValuesGen(StringType(), ['2023-03-20 10:38:50', '2023-03-20 10:39:02']), 'TIMESTAMP')] @allow_non_gpu('ProjectExec') @@ -739,9 +850,16 @@ def test_cast_fallback_not_UTC(from_gen, to_type): lambda spark: unary_op_df(spark, from_gen).selectExpr("CAST(a AS {}) as casted".format(to_type)), "Cast", {"spark.sql.session.timeZone": "+08", - "spark.rapids.sql.castStringToTimestamp.enabled": "true"}) + "spark.rapids.sql.castStringToTimestamp.enabled": True}) -def test_cast_date_integral_and_fp(): + +def test_cast_date_integral_and_fp_ansi_off(): + """ + This tests that a date column can be cast to different numeric/floating-point types. + This needs to be tested with ANSI disabled, because none of these conversions are + ANSI-compliant. + """ assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, date_gen).selectExpr( - "cast(a as boolean)", "cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) + "cast(a as boolean)", "cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)"), + conf=ansi_disabled_conf) diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index 099eb28c053..813f1a77c94 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error from data_gen import * from pyspark.sql.types import * + +from spark_session import is_before_spark_400 from string_test import mk_str_gen import pyspark.sql.functions as f import pyspark.sql.utils @@ -326,8 +328,11 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) @allow_non_gpu(*non_utc_allow) def test_sequence_too_long_sequence(stop_gen): - msg = "Too long sequence" if is_before_spark_334() or (not is_before_spark_340() and is_before_spark_342()) \ - or is_spark_350() else "Unsuccessful try to create array with" + msg = "Too long sequence" if is_before_spark_334() \ + or (not is_before_spark_340() and is_before_spark_342()) \ + or is_spark_350() \ + else "Can't create array" if not is_before_spark_400() \ + else "Unsuccessful try to create array with" assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. 
lambda spark:unary_op_df(spark, stop_gen, 1).selectExpr( diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 0c877f00238..a38cac3c0a7 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -13,7 +13,7 @@ # limitations under the License. import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_are_equal_sql from conftest import is_utc, is_supported_time_zone, get_test_tz from data_gen import * from datetime import date, datetime, timezone @@ -459,6 +459,23 @@ def test_to_timestamp(parser_policy): .select(f.col("a"), f.to_timestamp(f.col("a"), "yyyy-MM-dd HH:mm:ss")), { "spark.sql.legacy.timeParserPolicy": parser_policy}) +# mm: minute; MM: month +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.parametrize("format", ['yyyyMMdd', 'yyyymmdd'], ids=idfn) +# Test years after 1900, refer to issues: https://github.com/NVIDIA/spark-rapids/issues/11543, https://github.com/NVIDIA/spark-rapids/issues/11539 +@pytest.mark.skipif(get_test_tz() != "Asia/Shanghai" and get_test_tz() != "UTC", reason="https://github.com/NVIDIA/spark-rapids/issues/11562") +def test_formats_for_legacy_mode(format): + gen = StringGen('(19[0-9]{2}|[2-9][0-9]{3})([0-9]{4})') + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, gen), + "tab", + '''select unix_timestamp(a, '{}'), + from_unixtime(unix_timestamp(a, '{}'), '{}'), + date_format(to_timestamp(a, '{}'), '{}') + from tab + '''.format(format, format, format, format, format), + {'spark.sql.legacy.timeParserPolicy': 'LEGACY', + 'spark.rapids.sql.incompatibleDateFormats.enabled': True}) @tz_sensitive_test @pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") @@ -671,3 +688,15 @@ def test_timestamp_millis_long_overflow(): def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) + + +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.parametrize('parser_policy', ['LEGACY', 'CORRECTED', 'EXCEPTION'], ids=idfn) +def test_date_to_timestamp(parser_policy): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, date_gen), + "tab", + "SELECT cast(a as timestamp) from tab", + conf = { + "spark.sql.legacy.timeParserPolicy": parser_policy, + "spark.rapids.sql.incompatibleDateFormats.enabled": True}) diff --git a/integration_tests/src/main/python/json_matrix_test.py b/integration_tests/src/main/python/json_matrix_test.py index c9dec8afac9..136a4b041f8 100644 --- a/integration_tests/src/main/python/json_matrix_test.py +++ b/integration_tests/src/main/python/json_matrix_test.py @@ -17,9 +17,6 @@ from asserts import * from data_gen import * -from conftest import is_not_utc -from datetime import timezone -from conftest import 
is_databricks_runtime from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides from spark_session import * @@ -406,7 +403,6 @@ def test_json_tuple_allow_backslash_escape_any_off(std_input_path): # Off is the default for scan so it really needs to work @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) -@pytest.mark.xfail(reason = 'https://github.com/NVIDIA/spark-rapids/issues/10457') def test_scan_json_allow_unquoted_control_chars_off(std_input_path, read_func, spark_tmp_table_factory): assert_gpu_and_cpu_are_equal_collect( read_func(std_input_path + '/' + WITH_UNQUOTED_CONTROL_FILE, @@ -417,7 +413,6 @@ def test_scan_json_allow_unquoted_control_chars_off(std_input_path, read_func, s # Off is the default for from_json so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 -@pytest.mark.xfail(reason = 'https://github.com/NVIDIA/spark-rapids/issues/10457') def test_from_json_allow_unquoted_control_chars_off(std_input_path): schema = WITH_UNQUOTED_CONTROL_SCHEMA assert_gpu_and_cpu_are_equal_collect( @@ -583,6 +578,7 @@ def test_json_tuple_dec_locale_non_aribic(std_input_path): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -590,7 +586,10 @@ def test_json_tuple_dec_locale_non_aribic(std_input_path): "timestamp_formatted_strings.json", "timestamp_tz_formatted_strings.json"] -@pytest.mark.parametrize('input_file', COMMON_TEST_FILES) +COMMON_SCAN_TEST_FILES = COMMON_TEST_FILES + [ + "scan_emtpy_lines.json"] + +@pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. They are the same def test_scan_json_bytes(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -607,7 +606,7 @@ def test_from_json_bytes(std_input_path, input_file): lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_json_to_structs_conf) -@pytest.mark.parametrize('input_file', COMMON_TEST_FILES) +@pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. They are the same def test_scan_json_shorts(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -624,7 +623,7 @@ def test_from_json_shorts(std_input_path, input_file): lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_json_to_structs_conf) -@pytest.mark.parametrize('input_file', COMMON_TEST_FILES) +@pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_ints(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -641,7 +640,7 @@ def test_from_json_ints(std_input_path, input_file): lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_json_to_structs_conf) -@pytest.mark.parametrize('input_file', COMMON_TEST_FILES) +@pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. They are the same def test_scan_json_longs(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -661,8 +660,8 @@ def test_from_json_longs(std_input_path, input_file): @pytest.mark.parametrize('dt', [DecimalType(38,0), DecimalType(38,10), DecimalType(10,2)], ids=idfn) @pytest.mark.parametrize('input_file', [ "int_formatted.json", - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "float_formatted.json", + "sci_formatted.json", "int_formatted_strings.json", "float_formatted_strings.json", "sci_formatted_strings.json", @@ -673,12 +672,14 @@ def test_from_json_longs(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", - "timestamp_tz_formatted_strings.json"]) + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, input_file, dt): assert_gpu_and_cpu_are_equal_collect( @@ -690,8 +691,8 @@ def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, inpu @pytest.mark.parametrize('dt', [DecimalType(38,0), DecimalType(38,10), DecimalType(10,2)], ids=idfn) @pytest.mark.parametrize('input_file', [ "int_formatted.json", - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "float_formatted.json", + "sci_formatted.json", "int_formatted_strings.json", "float_formatted_strings.json", "sci_formatted_strings.json", @@ -702,6 +703,7 @@ def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, inpu "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -725,17 +727,19 @@ def test_from_json_decs(std_input_path, input_file, dt): "sci_formatted_strings.json", "decimal_locale_formatted_strings.json", pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/10495')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10479')), - pytest.param("invalid_ridealong_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + "boolean_formatted.json", + "invalid_ridealong_columns.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15318')), "int_struct_formatted.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_spark_400_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/11154')), + "bad_whitespace.json", "escaped_strings.json", pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", - "timestamp_tz_formatted_strings.json"]) + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_strings(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -753,11 +757,12 @@ def test_scan_json_strings(std_input_path, read_func, spark_tmp_table_factory, i "sci_formatted_strings.json", "decimal_locale_formatted_strings.json", "single_quoted_strings.json", - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10479')), - pytest.param("invalid_ridealong_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + "boolean_formatted.json", + "invalid_ridealong_columns.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15318')), "int_struct_formatted.json", 
"int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -785,7 +790,8 @@ def test_from_json_strings(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11386')), + "bad_whitespace.json", + "escaped_strings.json", pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11387')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", @@ -813,6 +819,7 @@ def test_get_json_object_formats(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -851,6 +858,7 @@ def test_get_json_object_child_formats(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11386')), pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11387')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -867,7 +875,7 @@ def test_json_tuple_formats(std_input_path, input_file): '''json_tuple(json, 'user.profile.username', 'user.skills[0]', 'user.projects[1].name') AS (username, first_skill, second_project_name)'''), conf =_enable_json_tuple_conf) -@pytest.mark.parametrize('input_file', COMMON_TEST_FILES) +@pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_bools(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -888,7 +896,7 @@ def test_from_json_bools(std_input_path, input_file): @pytest.mark.parametrize('input_file', [ "int_formatted.json", pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10481')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "sci_formatted.json", "int_formatted_strings.json", pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://issues.apache.org/jira/browse/SPARK-38060')), "sci_formatted_strings.json", @@ -898,12 +906,14 @@ def test_from_json_bools(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", 
"timestamp_formatted_strings.json", - "timestamp_tz_formatted_strings.json"]) + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -916,7 +926,7 @@ def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, in @pytest.mark.parametrize('input_file', [ "int_formatted.json", pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10481')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "sci_formatted.json", "int_formatted_strings.json", pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://issues.apache.org/jira/browse/SPARK-38060')), "sci_formatted_strings.json", @@ -926,6 +936,7 @@ def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, in "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -943,7 +954,7 @@ def test_from_json_floats(std_input_path, input_file): @pytest.mark.parametrize('input_file', [ "int_formatted.json", pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10481')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "sci_formatted.json", "int_formatted_strings.json", pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://issues.apache.org/jira/browse/SPARK-38060')), "sci_formatted_strings.json", @@ -953,12 +964,14 @@ def test_from_json_floats(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", - "timestamp_tz_formatted_strings.json"]) + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -971,7 +984,7 @@ def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, i @pytest.mark.parametrize('input_file', [ "int_formatted.json", pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10481')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15280')), + "sci_formatted.json", "int_formatted_strings.json", pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://issues.apache.org/jira/browse/SPARK-38060')), "sci_formatted_strings.json", @@ -981,6 +994,7 @@ def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, i "int_array_formatted.json", 
"int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -1007,12 +1021,14 @@ def test_from_json_doubles(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391'))]) + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11391')), + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) @allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_factory, input_file): @@ -1036,6 +1052,7 @@ def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_fa "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -1063,12 +1080,14 @@ def test_from_json_corrected_dates(std_input_path, input_file): "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/6846'))]) + pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/6846')), + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) @allow_non_gpu(*non_utc_allow) def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_table_factory, input_file): @@ -1092,6 +1111,7 @@ def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_tab "int_array_formatted.json", "int_struct_formatted.json", "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", "escaped_strings.json", "nested_escaped_strings.json", pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), @@ -1107,24 +1127,26 @@ def test_from_json_corrected_timestamps(std_input_path, input_file): conf = conf) 
@pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), + "bad_whitespace.json", + "escaped_strings.json", "nested_escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_long_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1134,24 +1156,25 @@ def test_scan_json_long_arrays(std_input_path, read_func, spark_tmp_table_factor conf=_enable_all_types_json_scan_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), + "bad_whitespace.json", + "escaped_strings.json", "nested_escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_long_arrays(std_input_path, input_file): schema = StructType([StructField("data", ArrayType(LongType()))]) @@ -1160,24 +1183,26 @@ def test_from_json_long_arrays(std_input_path, input_file): conf =_enable_json_to_structs_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", 
marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10574')), "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", "nested_escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_string_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1187,24 +1212,25 @@ def test_scan_json_string_arrays(std_input_path, read_func, spark_tmp_table_fact conf=_enable_all_types_json_scan_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10574')), "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", "nested_escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_string_arrays(std_input_path, input_file): schema = StructType([StructField("data", ArrayType(StringType()))]) @@ -1213,24 +1239,26 @@ def test_from_json_string_arrays(std_input_path, input_file): conf =_enable_json_to_structs_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - 
pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", "int_array_formatted.json", pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_long_structs(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1240,24 +1268,25 @@ def test_scan_json_long_structs(std_input_path, read_func, spark_tmp_table_facto conf=_enable_all_types_json_scan_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", "int_array_formatted.json", pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_long_structs(std_input_path, input_file): schema = StructType([StructField("data", StructType([StructField("A", LongType()),StructField("B", LongType())]))]) @@ -1266,24 +1295,26 @@ def test_from_json_long_structs(std_input_path, 
input_file): conf =_enable_json_to_structs_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_string_structs(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( @@ -1293,24 +1324,25 @@ def test_scan_json_string_structs(std_input_path, read_func, spark_tmp_table_fac conf=_enable_all_types_json_scan_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_string_structs(std_input_path, input_file): schema = StructType([StructField("data", StructType([StructField("A", StringType()),StructField("B", StringType())]))]) @@ -1320,23 +1352,25 @@ def test_from_json_string_structs(std_input_path, input_file): @pytest.mark.parametrize('dt', [DecimalType(38,0), DecimalType(10,2)], ids=idfn) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - 
pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), # This does not fail on 38,0 "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_stringted_.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), + "bad_whitespace.json", + "escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_dec_arrays(std_input_path, read_func, spark_tmp_table_factory, input_file, dt): assert_gpu_and_cpu_are_equal_collect( @@ -1347,24 +1381,25 @@ def test_scan_json_dec_arrays(std_input_path, read_func, spark_tmp_table_factory @pytest.mark.parametrize('dt', [DecimalType(38,0), DecimalType(10,2)], ids=idfn) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), # This does not fail on 38,0 "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), + "bad_whitespace.json", + "escaped_strings.json", "nested_escaped_strings.json", - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_dec_arrays(std_input_path, input_file, dt): schema = StructType([StructField("data", ArrayType(dt))]) @@ -1373,24 +1408,26 @@ def test_from_json_dec_arrays(std_input_path, input_file, dt): conf =_enable_json_to_structs_conf) @pytest.mark.parametrize('input_file', [ - pytest.param("int_formatted.json", 
marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("int_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("float_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("sci_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("decimal_locale_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("single_quoted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", - pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", - pytest.param("timestamp_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260')), - pytest.param("timestamp_tz_formatted_strings.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15260'))]) + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json", + "scan_emtpy_lines.json"]) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_mixed_struct(std_input_path, read_func, spark_tmp_table_factory, input_file): assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index ca5eb135715..fe1d9064933 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -23,6 +23,14 @@ from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides from spark_session import * +TEXT_INPUT_EXEC='FileSourceScanExec' + +# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653' +non_utc_file_source_scan_allow = ['FileSourceScanExec'] if is_not_utc() else [] + +non_utc_project_allow = ['ProjectExec'] if is_not_utc() else [] + + json_supported_gens = [ # Spark does not escape '\r' or '\n' even though it uses it to mark end of record # This would require multiLine reads to work correctly, so we avoid these chars @@ -350,6 +358,53 @@ def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_ options), conf=updated_conf) +@approximate_float +@pytest.mark.parametrize('filename', [ + 'boolean.json', + 'boolean_invalid.json', + 'ints.json', + pytest.param('ints_invalid.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4940')), # This fails for dates, as not all are invalid + 'nan_and_inf.json', + pytest.param('nan_and_inf_strings.json', marks=pytest.mark.skipif(is_before_spark_330(), reason='https://issues.apache.org/jira/browse/SPARK-38060 fixed in Spark 3.3.0')), + 'nan_and_inf_invalid.json', + 'floats.json', + 'floats_leading_zeros.json', + 'floats_invalid.json', + 'floats_edge_cases.json', + 'decimals.json', + 'dates.json', + 'dates_invalid.json', +]) +@pytest.mark.parametrize('schema', [_bool_schema, _byte_schema, _short_schema, _int_schema, _long_schema, \ + _float_schema, _double_schema, _decimal_10_2_schema, _decimal_10_3_schema, \ + _date_schema], ids=idfn) +@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false']) +@pytest.mark.parametrize('allow_numeric_leading_zeros', [ + 'true', + 'false' +]) +@pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_project_allow) +@pytest.mark.parametrize('date_format', [None, 'yyyy-MM-dd']) +def test_basic_from_json(std_input_path, filename, schema, allow_non_numeric_numbers, \ + allow_numeric_leading_zeros, ansi_enabled, date_format): + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.ansi.enabled': ansi_enabled, + 'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}) + options = {"allowNonNumericNumbers": allow_non_numeric_numbers, + "allowNumericLeadingZeros": allow_numeric_leading_zeros, + } + + if date_format: + options['dateFormat'] = date_format + + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.text(std_input_path + '/' + filename). + selectExpr("value as json"). 
+ select(f.col("json"), f.from_json(f.col("json"), schema, options)), + conf=updated_conf) + + @ignore_order @pytest.mark.parametrize('filename', [ 'malformed1.ndjson', @@ -533,11 +588,6 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an else: assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) -# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653' -non_utc_file_source_scan_allow = ['FileSourceScanExec'] if is_not_utc() else [] - -non_utc_project_allow = ['ProjectExec'] if is_not_utc() else [] - @approximate_float @pytest.mark.parametrize('filename', [ 'timestamps.json', @@ -769,9 +819,6 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format 'ProjectExec', conf=conf) -# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653' -non_utc_project_allow = ['ProjectExec'] if is_not_utc() else [] - @pytest.mark.parametrize('timestamp_gen', [ # "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]" "\"" + optional_whitespace_regex + "[1-8]{1}[0-9]{3}-[0-3]{1,2}-[0-3]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\\.[0-9]{1,6})?Z?" + optional_whitespace_regex + "\"", diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index 6032d469fb2..7d041b387e4 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -485,6 +485,8 @@ def test_parquet_read_buffer_allocation_empty_blocks(spark_tmp_path, v1_enabled_ lambda spark : spark.read.parquet(data_path).filter("id < 2 or id > 990"), conf=all_confs) + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/7733") @@ -797,6 +799,8 @@ def test_parquet_read_nano_as_longs_true(std_input_path): 'FileSourceScanExec', conf=conf) + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 def test_many_column_project(): def _create_wide_data_frame(spark, num_cols): schema_dict = {} @@ -1285,27 +1289,64 @@ def test_parquet_read_case_insensitivity(spark_tmp_path): ) -# test read INT32 as INT8/INT16/Date -@pytest.mark.parametrize('reader_confs', reader_opt_confs) -@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -def test_parquet_int32_downcast(spark_tmp_path, reader_confs, v1_enabled_list): +def run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs, + v1_enabled_list, + ansi_conf): + """ + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, with ANSI mode enabled/disabled. + """ data_path = spark_tmp_path + '/PARQUET_DATA' write_schema = [("d", date_gen), ('s', short_gen), ('b', byte_gen)] + + # For test setup, write with ANSI disabled. + # Otherwise, CAST(d AS INT) will fail on Spark CPU. 
with_cpu_session( lambda spark: gen_df(spark, write_schema).selectExpr( "cast(d as Int) as d", "cast(s as Int) as s", - "cast(b as Int) as b").write.parquet(data_path)) + "cast(b as Int) as b").write.parquet(data_path), conf=ansi_disabled_conf) read_schema = StructType([StructField("d", DateType()), StructField("s", ShortType()), StructField("b", ByteType())]) conf = copy_and_update(reader_confs, - {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + {'spark.sql.sources.useV1SourceList': v1_enabled_list}, + ansi_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.read.schema(read_schema).parquet(data_path), conf=conf) + +@pytest.mark.parametrize('reader_confs', reader_opt_confs) +@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +def test_parquet_int32_downcast_ansi_disabled(spark_tmp_path, reader_confs, v1_enabled_list): + """ + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, with ANSI mode disabled. + """ + run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs, + v1_enabled_list, + ansi_disabled_conf) + + +def test_parquet_int32_downcast_ansi_enabled(spark_tmp_path): + """ + This is the flipside of test_parquet_int32_downcast_ansi_disabled. + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, now tested with ANSI + enabled. + A limited combination of test parameters is used to test ANSI enabled, + in the interest of brevity. + """ + run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs=native_parquet_file_reader_conf, + v1_enabled_list="", + ansi_conf=ansi_enabled_conf) + + @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize("types", [("byte", "short"), ("byte", "int"), ("short", "int")], ids=idfn) @@ -1340,6 +1381,10 @@ def test_parquet_nested_column_missing(spark_tmp_path, reader_confs, v1_enabled_ lambda spark: spark.read.schema(read_schema).parquet(data_path), conf=conf) +@pytest.mark.skipif(condition=is_databricks_runtime() and is_databricks_version_or_later(14,3), + reason="https://github.com/NVIDIA/spark-rapids/issues/11512") +@pytest.mark.skipif(condition=is_spark_400_or_later(), + reason="https://github.com/NVIDIA/spark-rapids/issues/11512") def test_parquet_check_schema_compatibility(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('int', int_gen), ('long', long_gen), ('dec32', decimal_gen_32bit)] @@ -1431,13 +1476,16 @@ def test_parquet_read_encryption(spark_tmp_path, reader_confs, v1_enabled_list): assert_spark_exception( lambda: with_gpu_session( lambda spark: spark.read.parquet(data_path).collect()), - error_message='Could not read footer for file') + error_message='Could not read footer') # Common message fragment between all Spark versions. + # Note that this isn't thrown explicitly by the plugin.
assert_spark_exception( lambda: with_gpu_session( lambda spark: spark.read.parquet(data_path).collect(), conf=conf), error_message='The GPU does not support reading encrypted Parquet files') + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 def test_parquet_read_count(spark_tmp_path): parquet_gens = [int_gen, string_gen, double_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index c2062605ca1..0de404953a9 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -284,6 +284,7 @@ def test_re_replace(): # We have shims to support empty strings for zero-repetition patterns # See https://github.com/NVIDIA/spark-rapids/issues/5456 +@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/11600") def test_re_replace_repetition(): gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') assert_gpu_and_cpu_are_equal_collect( @@ -698,6 +699,7 @@ def test_regexp_octal_digits(): ), conf=_regexp_conf) +@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/11600") def test_regexp_replace_digit(): gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ @@ -1076,6 +1078,7 @@ def test_regexp_memory_fallback(): } ) +@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/11600") def test_regexp_memory_ok(): gen = StringGen('test') assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/udf_cudf_test.py b/integration_tests/src/main/python/udf_cudf_test.py index 6d94a5da206..59069820d29 100644 --- a/integration_tests/src/main/python/udf_cudf_test.py +++ b/integration_tests/src/main/python/udf_cudf_test.py @@ -41,11 +41,6 @@ from marks import cudf_udf -if is_databricks_runtime() and is_spark_340_or_later(): - # Databricks 13.3 does not use separate reader/writer threads for Python UDFs - # which can lead to hangs. Skipping these tests until the Python UDF handling is updated. - pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493") - _conf = { 'spark.rapids.sql.exec.AggregateInPandasExec': 'true', 'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true', diff --git a/integration_tests/src/test/resources/bad_whitespace.json b/integration_tests/src/test/resources/bad_whitespace.json new file mode 100644 index 00000000000..0f3edebe336 --- /dev/null +++ b/integration_tests/src/test/resources/bad_whitespace.json @@ -0,0 +1,10 @@ +{"data": 1 . 0} +{"data": - 1 . 0} +{"data": + 1 . 0} +{"data": 1 E 1} +{"data": n u l l} +{"data": t r u e} +{"data": f a l s e} +{"data": 1 0} +{"data": 1, "other": 1 0} +{"data": "BAD NUM 1 000", "ride-along-num": 1 000} diff --git a/integration_tests/src/test/resources/float_formatted.json b/integration_tests/src/test/resources/float_formatted.json index 8f305c3dbed..c0b3dacdce8 100644 --- a/integration_tests/src/test/resources/float_formatted.json +++ b/integration_tests/src/test/resources/float_formatted.json @@ -20,6 +20,3 @@ {"data": 0.9999} {"data": +1.0} {"data": -1.0} -{"data": 1 . 0} -{"data": - 1 . 0} -{"data": + 1 . 
0} diff --git a/integration_tests/src/test/resources/invalid_ridealong_columns.json b/integration_tests/src/test/resources/invalid_ridealong_columns.json index e45013747d5..00092f2e436 100644 --- a/integration_tests/src/test/resources/invalid_ridealong_columns.json +++ b/integration_tests/src/test/resources/invalid_ridealong_columns.json @@ -14,7 +14,6 @@ {"data": "BAD NUM +1", "ride-along-num": +1} {"data": "BAD NUM 01", "ride-along-num": 01} {"data": "BAD NUM 00.1", "ride-along-num": 00.1} -{"data": "BAD NUM 1 000", "ride-along-num": 1 000} {"data": "BAD NUM 1,000", "ride-along-num": 1,000} {"data": "BAD NUM 1e", "ride-along-num": 1e} {"data": "BAD NUM 1ee2", "ride-along-num": 1ee2} diff --git a/integration_tests/src/test/resources/scan_emtpy_lines.json b/integration_tests/src/test/resources/scan_emtpy_lines.json new file mode 100644 index 00000000000..4845cf918b8 --- /dev/null +++ b/integration_tests/src/test/resources/scan_emtpy_lines.json @@ -0,0 +1,23 @@ + + + + +{"BAD"} + + + + +{"BAD"} + + + + +{"BAD"} + + + + + + + + diff --git a/integration_tests/src/test/resources/sci_formatted.json b/integration_tests/src/test/resources/sci_formatted.json index 2cc39c84308..d42056d8914 100644 --- a/integration_tests/src/test/resources/sci_formatted.json +++ b/integration_tests/src/test/resources/sci_formatted.json @@ -13,4 +13,3 @@ {"data": 1E-1} {"data": 1E+1} {"data": 1e1} -{"data": 1 E 1} diff --git a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala index 93d4ddb82a8..843cabdfd7e 100644 --- a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala +++ b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala @@ -55,8 +55,7 @@ class MortgageSparkSuite extends AnyFunSuite { builder.getOrCreate() } - // test failing, tracked by https://github.com/NVIDIA/spark-rapids/issues/11436 - ignore("extract mortgage data") { + test("extract mortgage data") { val df = Run.csv( session, getClass.getClassLoader.getResource("Performance_2007Q3.txt_0").getPath, diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index 13f0c0a8b79..caaa47245a8 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT jdk8 diff --git a/jenkins/Dockerfile-blossom.integration.rocky b/jenkins/Dockerfile-blossom.integration.rocky index b293bba640f..5dbe24c9d17 100644 --- a/jenkins/Dockerfile-blossom.integration.rocky +++ b/jenkins/Dockerfile-blossom.integration.rocky @@ -45,17 +45,16 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm -f ~/miniconda.sh ENV PATH="/opt/conda/bin:$MAVEN_HOME/bin:${PATH}" -RUN conda init -# TODO: re-enable mamba solver after https://github.com/NVIDIA/spark-rapids/issues/9393 -# conda config --set solver libmamba +RUN conda init && conda install -n base -c conda-forge mamba # 'pyarrow' and 'pandas' will be installed as the dependencies of cudf below RUN export CUDA_VER=`echo ${CUDA_VER} | cut -d '.' 
-f 1,2` && \ - conda install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults cudf=${CUDF_VER} python=3.10 cuda-version=${CUDA_VER} && \ - conda install -y spacy && python -m spacy download en_core_web_sm && \ - conda install -y -c anaconda pytest requests && \ - conda install -y -c conda-forge sre_yield && \ - conda clean -ay + mamba install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ + cudf=${CUDF_VER} python=3.10 cuda-version=${CUDA_VER} && \ + mamba install -y spacy && python -m spacy download en_core_web_sm && \ + mamba install -y -c anaconda pytest requests && \ + mamba install -y -c conda-forge sre_yield && \ + mamba clean -ay # install pytest plugins for xdist parallel run RUN python -m pip install findspark pytest-xdist pytest-order fastparquet==2024.5.0 diff --git a/jenkins/Dockerfile-blossom.integration.ubuntu b/jenkins/Dockerfile-blossom.integration.ubuntu index b33309881e8..bd9c048a716 100644 --- a/jenkins/Dockerfile-blossom.integration.ubuntu +++ b/jenkins/Dockerfile-blossom.integration.ubuntu @@ -57,17 +57,16 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm -f ~/miniconda.sh ENV PATH="/opt/conda/bin:$MAVEN_HOME/bin:${PATH}" -RUN conda init -# TODO: re-enable mamba solver after https://github.com/NVIDIA/spark-rapids/issues/9393 -# conda config --set solver libmamba +RUN conda init && conda install -n base -c conda-forge mamba # 'pyarrow' and 'pandas' will be installed as the dependencies of cudf below RUN export CUDA_VER=`echo ${CUDA_VER} | cut -d '.' -f 1,2` && \ - conda install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults cudf=${CUDF_VER} python=3.10 cuda-version=${CUDA_VER} && \ - conda install -y spacy && python -m spacy download en_core_web_sm && \ - conda install -y -c anaconda pytest requests && \ - conda install -y -c conda-forge sre_yield && \ - conda clean -ay + mamba install -y -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ + cudf=${CUDF_VER} python=3.10 cuda-version=${CUDA_VER} && \ + mamba install -y spacy && python -m spacy download en_core_web_sm && \ + mamba install -y -c anaconda pytest requests && \ + mamba install -y -c conda-forge sre_yield && \ + mamba clean -ay # install pytest plugins for xdist parallel run RUN python -m pip install findspark pytest-xdist pytest-order fastparquet==2024.5.0 diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index 831197c61d9..25bade91968 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -57,6 +57,7 @@ initialize() if [[ ! 
-d $HOME/apache-maven-3.6.3 ]]; then wget https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz -P /tmp tar xf /tmp/apache-maven-3.6.3-bin.tar.gz -C $HOME + rm -f /tmp/apache-maven-3.6.3-bin.tar.gz sudo ln -s $HOME/apache-maven-3.6.3/bin/mvn /usr/local/bin/mvn fi diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index 990b997c310..01b8757d835 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,7 +27,7 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-24.10.0-SNAPSHOT' + cluster_name = 'CI-GPU-databricks-24.12.0-SNAPSHOT' idletime = 240 runtime = '7.0.x-gpu-ml-scala2.12' num_workers = 1 diff --git a/jenkins/databricks/cudf_udf_test.sh b/jenkins/databricks/cudf_udf_test.sh index 685df1db482..1153acb68ec 100644 --- a/jenkins/databricks/cudf_udf_test.sh +++ b/jenkins/databricks/cudf_udf_test.sh @@ -16,7 +16,7 @@ # # This script sets the environment to run cudf_udf tests of RAPIDS Accelerator for Apache Spark on DB. -# cudf conda packages need to be installed in advance, please refer to +# cudf python packages need to be installed in advance, please refer to # './jenkins/databricks/init_cudf_udf.sh' to install. # All the environments can be overwritten by shell variables: # LOCAL_JAR_PATH: Location of the RAPIDS jars @@ -26,23 +26,20 @@ # - Running tests on Databricks: # `./jenkins/databricks/cudf-udf-test.sh` # To add support of a new runtime: -# 1. Check if any more dependencies need to be added to the apt/conda install commands. +# 1. Check if any more dependencies need to be added to the apt/conda/pip install commands. # 2. If you had to go beyond the above steps to support the new runtime, then update the # instructions accordingly. set -ex -# Try to use "cudf-udf" conda environment for the python cudf-udf tests. -CONDA_HOME=${CONDA_HOME:-"/databricks/conda"} -if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then - echo "Error not found cudf conda packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install." +# Try to use "cudf-udf" conda/pip environment for the python cudf-udf tests. +CUDF_PY_ENV=${CUDF_PY_ENV:-$(echo /databricks/*/envs/cudf-udf)} +if [ ! -d "${CUDF_PY_ENV}" ]; then + echo "Error not found cudf-py packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install." exit -1 fi -export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH -export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python # Set the path of python site-packages. -# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3 -PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))') -PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages" +PYTHON_SITE_PACKAGES=$(echo -n ${CUDF_PY_ENV}/*/lib/site-packages) +[ -d "${CUDF_PY_ENV}/bin" ] && export PATH=${CUDF_PY_ENV}/bin:$PATH SOURCE_PATH="/home/ubuntu/spark-rapids" [[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index aed4e2a4c25..16b90b95c0e 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,7 +20,7 @@ set -ex -CUDF_VER=${CUDF_VER:-24.10} +CUDF_VER=${CUDF_VER:-24.12} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. 
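The init_cudf_udf.sh hunk that follows, like several other scripts touched by this patch, gates logic on a minimum version with the shell idiom printf '%s\n' "X" "$VER" | sort -V | head -n1, which yields the smaller of the two version strings; comparing that result to X asks whether VER is at least X. For readers unfamiliar with the idiom, here is a rough Scala equivalent of the comparison (object and method names are hypothetical, and only numeric dotted versions are handled):

// Rough, illustrative equivalent of the "sort -V | head -n1" minimum-version check.
object VersionAtLeast {
  private def parts(v: String): Seq[Int] =
    v.split("\\.").toSeq.map(p => p.takeWhile(_.isDigit)).filter(_.nonEmpty).map(_.toInt)

  def atLeast(version: String, minimum: String): Boolean = {
    val (a, b) = (parts(version), parts(minimum))
    val len = math.max(a.length, b.length)
    val pa = a.padTo(len, 0)
    val pb = b.padTo(len, 0)
    // Compare on the first differing component; equal versions count as "at least".
    pa.zip(pb).find { case (x, y) => x != y }.forall { case (x, y) => x > y }
  }

  def main(args: Array[String]): Unit = {
    println(atLeast("3.10", "3.10"))     // true  -> Python 3.10 passes the gate
    println(atLeast("3.9", "3.10"))      // false -> Python 3.9 is rejected
    println(atLeast("1.12.1", "1.12.0")) // true  -> the parquet-hadoop tests jar is fetched
  }
}
// end of illustrative sketch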
@@ -28,30 +28,16 @@ export PATH=/databricks/conda/bin:$PATH # Set Python for the running instance export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"} PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("{}.{}".format(sys.version_info.major, sys.version_info.minor))') -# Rapids 23.06+ drops python 3.8 conda packages. ref: https://docs.rapids.ai/notices/rsn0029/ -if [[ "$(printf '%s\n' "3.9" "${PYTHON_VERSION}" | sort -V | head -n1)" = "3.9" ]]; then +# Rapids 24.10+ drops python 3.9 or below conda packages. ref: https://docs.rapids.ai/notices/rsn0040/ +if [[ "$(printf '%s\n' "3.10" "${PYTHON_VERSION}" | sort -V | head -n1)" == "3.10" ]]; then # To fix "'lsb_release -a' returned non-zero". ref: https://github.com/pypa/pip/issues/4924 [[ -n "$(which lsb_release)" ]] && mv $(which lsb_release) $(which lsb_release)"-bak" else - echo "Rapids 23.06+ drops python 3.8 or below versions of conda packages" + echo "Rapids 24.10+ drops python 3.9 or below versions of conda packages" exit -1 fi -base=$(conda info --base) -# Create and activate 'cudf-udf' conda env for cudf-udf tests -sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs -conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \ - source activate && \ - conda activate cudf-udf - -# Use mamba to install cudf-udf packages to speed up conda resolve time -conda install -y -c conda-forge mamba python=$PYTHON_VERSION -# Do not error out "This operation will remove conda without replacing it with another version of conda." for now -${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true - REQUIRED_PACKAGES=( - cuda-version=$CUDA_VER - cudf=$CUDF_VER findspark pandas pyarrow @@ -61,9 +47,42 @@ REQUIRED_PACKAGES=( requests sre_yield ) +if command -v conda >/dev/null 2>&1; then + base=$(conda info --base) + # Create and activate 'cudf-udf' conda env for cudf-udf tests + sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs + conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \ + source activate && \ + conda activate cudf-udf + + # Use mamba to install cudf-udf packages to speed up conda resolve time + conda install -y -c conda-forge mamba python=$PYTHON_VERSION + # Do not error out "This operation will remove conda without replacing it with another version of conda." 
for now + ${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true + + REQUIRED_PACKAGES=( + cuda-version=$CUDA_VER + cudf=$CUDF_VER + ${REQUIRED_PACKAGES[@]} + ) -${base}/envs/cudf-udf/bin/mamba install -y \ - -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ - "${REQUIRED_PACKAGES[@]}" + ${base}/envs/cudf-udf/bin/mamba install -y \ + -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ + "${REQUIRED_PACKAGES[@]}" -source deactivate && conda deactivate + source deactivate && conda deactivate +else + # pip install cudf-py, refer to: https://docs.rapids.ai/install#selector + # The prefix /databricks/python-bootstrap/ for PYTHON_SITE_PACKAGES is mandatory for Databricks init scripts + PYTHON_SITE_PACKAGES="/databricks/python-bootstrap/envs/cudf-udf/$PYTHON_VERSION/lib/site-packages" + pip install --target=${PYTHON_SITE_PACKAGES} \ + --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \ + "cudf-cu11>=${CUDF_VER}.0a0,<=${CUDF_VER}" + + REQUIRED_PACKAGES=( + ${REQUIRED_PACKAGES[@]} + scipy + numexpr + ) + pip install --target=${PYTHON_SITE_PACKAGES} ${REQUIRED_PACKAGES[@]} +fi diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index 5ea4fce625b..38728161d12 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -49,6 +49,7 @@ source jenkins/databricks/common_vars.sh BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-$(< /databricks/spark/VERSION)} SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM:-spark${BASE_SPARK_VERSION//./}db} SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM//\-SNAPSHOT/} +WITH_DEFAULT_UPSTREAM_SHIM=${WITH_DEFAULT_UPSTREAM_SHIM:-1} IS_SPARK_321_OR_LATER=0 [[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1 @@ -90,6 +91,18 @@ run_pyarrow_tests() { ## Separate the integration tests into "CI_PART1" and "CI_PART2", run each part in parallel on separate Databricks clusters to speed up the testing process. if [[ $TEST_MODE == "DEFAULT" || $TEST_MODE == "CI_PART1" ]]; then + # Run two-shim smoke test with the base Spark build + if [[ "$WITH_DEFAULT_UPSTREAM_SHIM" != "0" ]]; then + if [[ ! 
-d $HOME/spark-3.2.0-bin-hadoop3.2 ]]; then + wget https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -P /tmp + tar xf /tmp/spark-3.2.0-bin-hadoop3.2.tgz -C $HOME + rm -f /tmp/spark-3.2.0-bin-hadoop3.2.tgz + fi + SPARK_HOME=$HOME/spark-3.2.0-bin-hadoop3.2 \ + SPARK_SHELL_SMOKE_TEST=1 \ + PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.spark320.RapidsShuffleManager \ + bash integration_tests/run_pyspark_from_build.sh + fi bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE fi diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index de5392b4a1e..7f7ba8d65a9 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -182,7 +182,7 @@ if [[ $SKIP_DEPLOY != 'true' ]]; then distWithReducedPom "deploy" # this deploys selected submodules that is unconditionally built with Spark 3.2.0 - $MVN -B deploy -pl $DEPLOY_SUBMODULES \ + $MVN -B deploy -pl "!${DIST_PL}" \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ -DskipTests \ -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \ diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index 71e580f38c5..e09fe78cbf7 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -24,39 +24,27 @@ nvidia-smi WORKSPACE=${WORKSPACE:-`pwd`} ARTF_ROOT="$WORKSPACE/jars" -MVN_GET_CMD="mvn -Dmaven.wagon.http.retryHandler.count=3 org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \ - -Dmaven.repo.local=$WORKSPACE/.m2 \ - $MVN_URM_MIRROR -Ddest=$ARTF_ROOT" +WGET_CMD="wget -q -P $ARTF_ROOT -t 3" rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT - -# TODO remove -Dtransitive=false workaround once pom is fixed -$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \ - -Dtransitive=false \ - -DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=$SHUFFLE_SPARK_SHIM +$WGET_CMD $PROJECT_TEST_REPO/com/nvidia/rapids-4-spark-integration-tests_$SCALA_BINARY_VER/$PROJECT_TEST_VER/rapids-4-spark-integration-tests_$SCALA_BINARY_VER-$PROJECT_TEST_VER-${SHUFFLE_SPARK_SHIM}.jar CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility if [ "$CLASSIFIER"x == x ];then - $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \ - -DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER - export RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT_VER.jar" + $WGET_CMD $PROJECT_REPO/com/nvidia/rapids-4-spark_$SCALA_BINARY_VER/$PROJECT_VER/rapids-4-spark_$SCALA_BINARY_VER-${PROJECT_VER}.jar + export RAPIDS_PLUGIN_JAR=$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-${PROJECT_VER}.jar else - $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \ - -DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER -Dclassifier=$CLASSIFIER + $WGET_CMD $PROJECT_REPO/com/nvidia/rapids-4-spark_$SCALA_BINARY_VER/$PROJECT_VER/rapids-4-spark_$SCALA_BINARY_VER-$PROJECT_VER-${CLASSIFIER}.jar export RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT_VER-${CLASSIFIER}.jar" fi RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-$SHUFFLE_SPARK_SHIM.jar" export INCLUDE_SPARK_AVRO_JAR=${INCLUDE_SPARK_AVRO_JAR:-"true"} if [[ "${INCLUDE_SPARK_AVRO_JAR}" == "true" ]]; then - $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \ - -DgroupId=org.apache.spark -DartifactId=spark-avro_$SCALA_BINARY_VER -Dversion=$SPARK_VER + $WGET_CMD 
$PROJECT_REPO/org/apache/spark/spark-avro_$SCALA_BINARY_VER/$SPARK_VER/spark-avro_$SCALA_BINARY_VER-${SPARK_VER}.jar fi -# TODO remove -Dtransitive=false workaround once pom is fixed -$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \ - -Dtransitive=false \ - -DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=pytest -Dpackaging=tar.gz +$WGET_CMD $PROJECT_TEST_REPO/com/nvidia/rapids-4-spark-integration-tests_$SCALA_BINARY_VER/$PROJECT_TEST_VER/rapids-4-spark-integration-tests_$SCALA_BINARY_VER-$PROJECT_TEST_VER-pytest.tar.gz RAPIDS_INT_TESTS_HOME="$ARTF_ROOT/integration_tests/" # The version of pytest.tar.gz that is uploaded is the one built against spark320 but its being pushed without classifier for now @@ -101,13 +89,12 @@ fi tar xzf "$RAPIDS_INT_TESTS_TGZ" -C $ARTF_ROOT && rm -f "$RAPIDS_INT_TESTS_TGZ" . jenkins/hadoop-def.sh $SPARK_VER ${SCALA_BINARY_VER} -wget -P $ARTF_ROOT $SPARK_REPO/org/apache/spark/$SPARK_VER/spark-$SPARK_VER-$BIN_HADOOP_VER.tgz +$WGET_CMD $SPARK_REPO/org/apache/spark/$SPARK_VER/spark-$SPARK_VER-$BIN_HADOOP_VER.tgz # Download parquet-hadoop jar for parquet-read encryption tests PARQUET_HADOOP_VER=`mvn help:evaluate -q -N -Dexpression=parquet.hadoop.version -DforceStdout -Dbuildver=${SHUFFLE_SPARK_SHIM/spark/}` if [[ "$(printf '%s\n' "1.12.0" "$PARQUET_HADOOP_VER" | sort -V | head -n1)" = "1.12.0" ]]; then - $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \ - -DgroupId=org.apache.parquet -DartifactId=parquet-hadoop -Dversion=$PARQUET_HADOOP_VER -Dclassifier=tests + $WGET_CMD $PROJECT_REPO/org/apache/parquet/parquet-hadoop/$PARQUET_HADOOP_VER/parquet-hadoop-$PARQUET_HADOOP_VER-tests.jar fi export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-$BIN_HADOOP_VER" diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index 62a796b50a5..8600a2f8689 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -29,8 +29,8 @@ IFS=$PRE_IFS CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility -PROJECT_VER=${PROJECT_VER:-"24.10.0-SNAPSHOT"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.10.0-SNAPSHOT"} +PROJECT_VER=${PROJECT_VER:-"24.12.0-SNAPSHOT"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.12.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.2.0"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. 
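The spark-tests.sh changes above replace mvn dependency:get with direct wget downloads, so each artifact URL is now assembled by hand from its Maven coordinates: repository, group id as a path, artifact id, version, and an optional classifier. A small Scala sketch of that layout follows; the repository URL and coordinates used in the demo are placeholders, not real CI values:

// Illustrative sketch of how a Maven artifact URL is composed from its coordinates.
object MavenUrlSketch {
  def artifactUrl(repo: String, group: String, artifact: String, version: String,
      classifier: Option[String] = None, ext: String = "jar"): String = {
    val groupPath = group.replace('.', '/')            // com.nvidia -> com/nvidia
    val suffix = classifier.map("-" + _).getOrElse("") // optional -cuda11, -tests, ...
    s"$repo/$groupPath/$artifact/$version/$artifact-$version$suffix.$ext"
  }

  def main(args: Array[String]): Unit = {
    // Prints: https://repo.example.com/maven/com/nvidia/rapids-4-spark_2.12/24.12.0-SNAPSHOT/
    //         rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar
    println(artifactUrl("https://repo.example.com/maven", "com.nvidia",
      "rapids-4-spark_2.12", "24.12.0-SNAPSHOT", Some("cuda11")))
  }
}
// end of illustrative sketch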
diff --git a/pom.xml b/pom.xml index 92a0c7170dd..f414a696739 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -73,6 +73,7 @@ aggregator datagen + df_udf dist integration_tests shuffle-plugin @@ -747,8 +748,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.10.0-SNAPSHOT - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT + 24.12.0-SNAPSHOT 2.12 2.8.0 incremental diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml index dab195a92e7..d1fecdbf365 100644 --- a/scala2.13/aggregator/pom.xml +++ b/scala2.13/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.13 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT aggregator diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml index e6b678d2451..f236345c301 100644 --- a/scala2.13/api_validation/pom.xml +++ b/scala2.13/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT api_validation diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml index 4d818798101..d53ebc014c7 100644 --- a/scala2.13/datagen/pom.xml +++ b/scala2.13/datagen/pom.xml @@ -21,18 +21,19 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.13 Data Generator Tools for generating large amounts of data - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT datagen **/* package + ${project.build.outputDirectory}/datagen-version-info.properties diff --git a/scala2.13/delta-lake/delta-20x/pom.xml b/scala2.13/delta-lake/delta-20x/pom.xml index 98bcfe03bc7..20c77038f40 100644 --- a/scala2.13/delta-lake/delta-20x/pom.xml +++ b/scala2.13/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/scala2.13/delta-lake/delta-21x/pom.xml b/scala2.13/delta-lake/delta-21x/pom.xml index 2ed9dc63043..75a41cfa8e0 100644 --- a/scala2.13/delta-lake/delta-21x/pom.xml +++ b/scala2.13/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/scala2.13/delta-lake/delta-22x/pom.xml b/scala2.13/delta-lake/delta-22x/pom.xml index 85c9fe54f10..c6111eb51a0 100644 --- a/scala2.13/delta-lake/delta-22x/pom.xml +++ b/scala2.13/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.13 RAPIDS Accelerator for Apache Spark Delta 
Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/scala2.13/delta-lake/delta-23x/pom.xml b/scala2.13/delta-lake/delta-23x/pom.xml index c2ad1d10871..84d1d7275c2 100644 --- a/scala2.13/delta-lake/delta-23x/pom.xml +++ b/scala2.13/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/scala2.13/delta-lake/delta-24x/pom.xml b/scala2.13/delta-lake/delta-24x/pom.xml index 6a848ced06e..0ffe6c84e10 100644 --- a/scala2.13/delta-lake/delta-24x/pom.xml +++ b/scala2.13/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml index 76b146d550a..3c30b1b0dc8 100644 --- a/scala2.13/delta-lake/delta-spark330db/pom.xml +++ b/scala2.13/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.13 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/scala2.13/delta-lake/delta-spark332db/pom.xml b/scala2.13/delta-lake/delta-spark332db/pom.xml index 2f65cd96559..a3501c1003c 100644 --- a/scala2.13/delta-lake/delta-spark332db/pom.xml +++ b/scala2.13/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.13 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/scala2.13/delta-lake/delta-spark341db/pom.xml b/scala2.13/delta-lake/delta-spark341db/pom.xml index f3004e2881d..c740362b11f 100644 --- a/scala2.13/delta-lake/delta-spark341db/pom.xml +++ b/scala2.13/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT false diff --git a/scala2.13/delta-lake/delta-stub/pom.xml b/scala2.13/delta-lake/delta-stub/pom.xml index 5933fd44154..2f90b85acd7 100644 --- a/scala2.13/delta-lake/delta-stub/pom.xml +++ b/scala2.13/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.13 RAPIDS 
Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/scala2.13/df_udf/pom.xml b/scala2.13/df_udf/pom.xml new file mode 100644 index 00000000000..04f7a6deb28 --- /dev/null +++ b/scala2.13/df_udf/pom.xml @@ -0,0 +1,88 @@ + + + + 4.0.0 + + com.nvidia + rapids-4-spark-shim-deps-parent_2.13 + 24.12.0-SNAPSHOT + ../shim-deps/pom.xml + + df_udf_plugin_2.13 + UDFs implemented in SQL/Dataframe + UDFs for Apache Spark implemented in SQL/Dataframe + 24.12.0-SNAPSHOT + + + df_udf + + **/* + package + ${project.build.outputDirectory}/df_udf-version-info.properties + + + + + org.scala-lang + scala-library + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.test.version} + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + true + + + + net.alchim31.maven + scala-maven-plugin + + + org.scalatest + scalatest-maven-plugin + + + org.apache.rat + apache-rat-plugin + + + + + + + ${project.build.directory}/extra-resources + + + + diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index 326b2ddebfe..15df1ec69f8 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.13 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 4c3ea72f341..88ab2531235 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT integration_tests diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml index 617a4239ac1..793bf0fb327 100644 --- a/scala2.13/jdk-profiles/pom.xml +++ b/scala2.13/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.13 pom Shim JDK Profiles - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT jdk8 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 14108a24d9b..e75635de413 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.13 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -73,6 +73,7 @@ aggregator datagen + df_udf dist integration_tests shuffle-plugin @@ -747,8 +748,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.10.0-SNAPSHOT - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT + 24.12.0-SNAPSHOT 2.13 2.8.0 incremental diff --git a/scala2.13/shim-deps/cloudera/pom.xml b/scala2.13/shim-deps/cloudera/pom.xml index 8297bc66564..95c49a2b1ca 100644 --- a/scala2.13/shim-deps/cloudera/pom.xml +++ b/scala2.13/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom pom CDH Shim Dependencies - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/cloudera diff --git a/scala2.13/shim-deps/databricks/pom.xml 
b/scala2.13/shim-deps/databricks/pom.xml index 8ca1e3cb7b0..9d6ff787ef1 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/databricks diff --git a/scala2.13/shim-deps/pom.xml b/scala2.13/shim-deps/pom.xml index d8a72da7afd..255488076b9 100644 --- a/scala2.13/shim-deps/pom.xml +++ b/scala2.13/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.13 pom Shim Dependencies Profiles - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT release321cdh diff --git a/scala2.13/shuffle-plugin/pom.xml b/scala2.13/shuffle-plugin/pom.xml index e9483779d19..b9e76b2f068 100644 --- a/scala2.13/shuffle-plugin/pom.xml +++ b/scala2.13/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.13 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT shuffle-plugin diff --git a/scala2.13/sql-plugin-api/pom.xml b/scala2.13/sql-plugin-api/pom.xml index bf80b7505c9..3c48d7c13f2 100644 --- a/scala2.13/sql-plugin-api/pom.xml +++ b/scala2.13/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.13 Module for Non-Shimmable API - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT sql-plugin-api false diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml index f4a05786ce7..b96e1517690 100644 --- a/scala2.13/sql-plugin/pom.xml +++ b/scala2.13/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.13 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT sql-plugin diff --git a/scala2.13/tests/pom.xml b/scala2.13/tests/pom.xml index 964cafc8ebb..377dc4671fb 100644 --- a/scala2.13/tests/pom.xml +++ b/scala2.13/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.13 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT tests @@ -156,6 +156,19 @@ 3.1.0.0-RC2 test + + org.apache.parquet + parquet-column + ${parquet.hadoop.version} + test + tests + + + org.apache.parquet + parquet-avro + ${parquet.hadoop.version} + test + diff --git a/scala2.13/tools/pom.xml b/scala2.13/tools/pom.xml index ea16d37c5a3..a75a7b47941 100644 --- a/scala2.13/tools/pom.xml +++ b/scala2.13/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia diff --git a/scala2.13/udf-compiler/pom.xml b/scala2.13/udf-compiler/pom.xml index 13e6b2b3027..10ad46a48aa 100644 --- a/scala2.13/udf-compiler/pom.xml +++ b/scala2.13/udf-compiler/pom.xml 
@@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.13 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT udf-compiler diff --git a/shim-deps/cloudera/pom.xml b/shim-deps/cloudera/pom.xml index bbb95ff209d..a9b71366927 100644 --- a/shim-deps/cloudera/pom.xml +++ b/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom pom CDH Shim Dependencies - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/cloudera diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index 8cdb135ed26..edfa3d6f896 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/databricks diff --git a/shim-deps/pom.xml b/shim-deps/pom.xml index c16b8eb56f0..bc1d9eeaa47 100644 --- a/shim-deps/pom.xml +++ b/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.12 pom Shim Dependencies Profiles - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT release321cdh diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml index d0e8fc3bbee..69d8f1b765b 100644 --- a/shuffle-plugin/pom.xml +++ b/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.12 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT shuffle-plugin diff --git a/sql-plugin-api/pom.xml b/sql-plugin-api/pom.xml index 65cf422f83a..090a809fc05 100644 --- a/sql-plugin-api/pom.xml +++ b/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.12 Module for Non-Shimmable API - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT sql-plugin-api false diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index d1c8c4d9ee5..bc35dad5372 100644 --- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -48,11 +48,11 @@ import org.apache.spark.util.MutableURLClassLoader Each shim can see a consistent parallel world without conflicts by referencing only one conflicting directory. 
E.g., Spark 3.2.0 Shim will use - jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark320/ + jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ Spark 3.3.1 will use - jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.10.0.jar!/spark331/ + jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark331/ Using these Jar URL's allows referencing different bytecode produced from identical sources by incompatible Scala / Spark dependencies. */ diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml index 8845b96edbe..c9cfb8ce99f 100644 --- a/sql-plugin/pom.xml +++ b/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.12 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT sql-plugin diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 8ae3450c0af..020220a679c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -31,7 +31,8 @@ import com.nvidia.spark.rapids.shims.{AnsiUtil, GpuCastShims, GpuIntervalUtils, import org.apache.commons.text.StringEscapeUtils import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, NullIntolerant, TimeZoneAwareExpression, UnaryExpression} +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, NullIntolerant, TimeZoneAwareExpression} +import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_SECOND import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf @@ -40,7 +41,8 @@ import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.types._ /** Meta-data for cast and ansi_cast. */ -final class CastExprMeta[INPUT <: UnaryExpression with TimeZoneAwareExpression with NullIntolerant]( +final class CastExprMeta[ + INPUT <: UnaryLike[Expression] with TimeZoneAwareExpression with NullIntolerant]( cast: INPUT, val evalMode: GpuEvalMode.Value, conf: RapidsConf, @@ -76,7 +78,7 @@ final class CastExprMeta[INPUT <: UnaryExpression with TimeZoneAwareExpression w } /** Meta-data for cast, ansi_cast and ToPrettyString */ -abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpression]( +abstract class CastExprMetaBase[INPUT <: UnaryLike[Expression] with TimeZoneAwareExpression]( cast: INPUT, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], @@ -90,6 +92,7 @@ abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpre override def isTimeZoneSupported: Boolean = { (fromType, toType) match { case (TimestampType, DateType) => true // this is for to_date(...) 
+ case (DateType, TimestampType) => true case _ => false } } @@ -631,6 +634,11 @@ object GpuCast { zoneId.normalized())) { shifted => shifted.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType)) } + case (DateType, TimestampType) if options.timeZoneId.isDefined => + val zoneId = DateTimeUtils.getZoneId(options.timeZoneId.get) + withResource(input.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType))) { cv => + GpuTimeZoneDB.fromTimestampToUtcTimestamp(cv, zoneId.normalized()) + } case _ => input.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType)) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpandExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpandExec.scala index e13d680a31d..942541d7320 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpandExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpandExec.scala @@ -53,7 +53,8 @@ class GpuExpandExecMeta( override def convertToGpu(): GpuExec = { val projections = gpuProjections.map(_.map(_.convertToGpu())) GpuExpandExec(projections, expand.output, childPlans.head.convertIfNeeded())( - preprojectEnabled = conf.isExpandPreprojectEnabled) + preprojectEnabled = conf.isExpandPreprojectEnabled, + coalesceAfter = conf.isCoalesceAfterExpandEnabled) } } @@ -65,15 +66,21 @@ class GpuExpandExecMeta( * output the same schema specified bye the parameter `output` * @param output Attribute references to Output * @param child Child operator + * @param preprojectEnabled Whether to enable pre-project before expanding + * @param coalesceAfter Whether to coalesce the output batches */ case class GpuExpandExec( projections: Seq[Seq[Expression]], output: Seq[Attribute], child: SparkPlan)( - preprojectEnabled: Boolean = false) extends ShimUnaryExecNode with GpuExec { + preprojectEnabled: Boolean = false, + override val coalesceAfter: Boolean = true +) extends ShimUnaryExecNode with GpuExec { override def otherCopyArgs: Seq[AnyRef] = Seq[AnyRef]( - preprojectEnabled.asInstanceOf[java.lang.Boolean]) + preprojectEnabled.asInstanceOf[java.lang.Boolean], + coalesceAfter.asInstanceOf[java.lang.Boolean] + ) private val PRE_PROJECT_TIME = "preprojectTime" override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpressions.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpressions.scala index 9da95461945..2fa33a597ca 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpressions.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExpressions.scala @@ -25,7 +25,7 @@ import com.nvidia.spark.rapids.shims.{ShimBinaryExpression, ShimExpression, Shim import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{DataType, StringType} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.unsafe.types.UTF8String @@ -52,6 +52,20 @@ object GpuExpressionsUtils { "implemented and should have been disabled") } + case class NullVecKey(d: DataType, n: Int) + + // accessOrder = true makes it LRU + class NullVecCache + extends java.util.LinkedHashMap[NullVecKey, GpuColumnVector](100, 0.75f, true) { + + override def clear(): Unit = { + super.values().forEach(_.close()) + super.clear() + } + } + + val cachedNullVectors = ThreadLocal.withInitial[NullVecCache](() => 
new NullVecCache) + /** * Tries to resolve a `GpuColumnVector` from a Scala `Any`. * @@ -73,7 +87,19 @@ object GpuExpressionsUtils { def resolveColumnVector(any: Any, numRows: Int): GpuColumnVector = { withResourceIfAllowed(any) { case c: GpuColumnVector => c.incRefCount() - case s: GpuScalar => GpuColumnVector.from(s, numRows, s.dataType) + case s: GpuScalar => + if (!s.isValid) { + val key = NullVecKey(s.dataType, numRows) + if (!cachedNullVectors.get.containsKey(key)) { + cachedNullVectors.get.put(key, + GpuColumnVector.from(s, numRows, s.dataType)) + } + + val ret = cachedNullVectors.get.get(key) + ret.incRefCount() + } else { + GpuColumnVector.from(s, numRows, s.dataType) + } case other => throw new IllegalArgumentException(s"Cannot resolve a ColumnVector from the value:" + s" $other. Please convert it to a GpuScalar or a GpuColumnVector before returning.") diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala index faca6d8e3c7..ab02d1f0eea 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala @@ -122,14 +122,21 @@ object MultiFileReaderThreadPool extends Logging { private var threadPool: Option[ThreadPoolExecutor] = None private def initThreadPool( - maxThreads: Int, + numThreadsFromConf: Int, keepAliveSeconds: Int = 60): ThreadPoolExecutor = synchronized { if (threadPool.isEmpty) { + val numThreads = Math.max(numThreadsFromConf, GpuDeviceManager.getNumCores) + + if (numThreadsFromConf != numThreads) { + logWarning(s"Configuring the file reader thread pool with a max of $numThreads " + + s"threads instead of ${RapidsConf.MULTITHREAD_READ_NUM_THREADS} = $numThreadsFromConf") + } + val threadPoolExecutor = - TrampolineUtil.newDaemonCachedThreadPool("multithreaded file reader worker", maxThreads, + TrampolineUtil.newDaemonCachedThreadPool("multithreaded file reader worker", numThreads, keepAliveSeconds) threadPoolExecutor.allowCoreThreadTimeOut(true) - logDebug(s"Using $maxThreads for the multithreaded reader thread pool") + logDebug(s"Using $numThreads for the multithreaded reader thread pool") threadPool = Some(threadPoolExecutor) } threadPool.get @@ -142,13 +149,7 @@ object MultiFileReaderThreadPool extends Logging { */ def getOrCreateThreadPool(numThreadsFromConf: Int): ThreadPoolExecutor = { threadPool.getOrElse { - val numThreads = Math.max(numThreadsFromConf, GpuDeviceManager.getNumCores) - - if (numThreadsFromConf != numThreads) { - logWarning(s"Configuring the file reader thread pool with a max of $numThreads " + - s"threads instead of ${RapidsConf.MULTITHREAD_READ_NUM_THREADS} = $numThreadsFromConf") - } - initThreadPool(numThreads) + initThreadPool(numThreadsFromConf) } } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala index e98d67fe843..ababb052f27 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala @@ -2822,6 +2822,12 @@ object MakeOrcTableProducer extends Logging { debugDumpPrefix: Option[String], debugDumpAlways: Boolean ): GpuDataProducer[Table] = { + debugDumpPrefix.foreach { prefix => + if (debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") + logWarning(s"Wrote data for ${splits.mkString(", ")} to 
$p") + } + } if (useChunkedReader) { OrcTableReader(conf, chunkSizeByteLimit, maxChunkedReaderMemoryUsageSizeBytes, parseOpts, buffer, offset, bufferSize, metrics, isSchemaCaseSensitive, readDataSchema, @@ -2838,19 +2844,17 @@ object MakeOrcTableProducer extends Logging { } catch { case e: Exception => val dumpMsg = debugDumpPrefix.map { prefix => - val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") - s", data dumped to $p" + if (!debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") + s", data dumped to $p" + } else { + "" + } }.getOrElse("") throw new IOException(s"Error when processing ${splits.mkString("; ")}$dumpMsg", e) } } closeOnExcept(table) { _ => - debugDumpPrefix.foreach { prefix => - if (debugDumpAlways) { - val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") - logWarning(s"Wrote data for ${splits.mkString(", ")} to $p") - } - } if (readDataSchema.length < table.getNumberOfColumns) { throw new QueryExecutionException(s"Expected ${readDataSchema.length} columns " + s"but read ${table.getNumberOfColumns} from ${splits.mkString("; ")}") @@ -2895,8 +2899,12 @@ case class OrcTableReader( } catch { case e: Exception => val dumpMsg = debugDumpPrefix.map { prefix => - val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") - s", data dumped to $p" + if (!debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") + s", data dumped to $p" + } else { + "" + } }.getOrElse("") throw new IOException(s"Error when processing $splitsString$dumpMsg", e) } @@ -2914,12 +2922,6 @@ case class OrcTableReader( } override def close(): Unit = { - debugDumpPrefix.foreach { prefix => - if (debugDumpAlways) { - val p = DumpUtils.dumpBuffer(conf, buffer, offset, bufferSize, prefix, ".orc") - logWarning(s"Wrote data for $splitsString to $p") - } - } reader.close() buffer.close() } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index 7e75940869b..8df4d907f10 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -2613,6 +2613,12 @@ object MakeParquetTableProducer extends Logging { debugDumpPrefix: Option[String], debugDumpAlways: Boolean ): GpuDataProducer[Table] = { + debugDumpPrefix.foreach { prefix => + if (debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") + logWarning(s"Wrote data for ${splits.mkString(", ")} to $p") + } + } if (useChunkedReader) { ParquetTableReader(conf, chunkSizeByteLimit, maxChunkedReaderMemoryUsageSizeBytes, opts, buffer, offset, @@ -2631,19 +2637,17 @@ object MakeParquetTableProducer extends Logging { } catch { case e: Exception => val dumpMsg = debugDumpPrefix.map { prefix => - val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") - s", data dumped to $p" + if (!debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") + s", data dumped to $p" + } else { + "" + } }.getOrElse("") throw new IOException(s"Error when processing ${splits.mkString("; ")}$dumpMsg", e) } } closeOnExcept(table) { _ => - debugDumpPrefix.foreach { prefix => - if (debugDumpAlways) { - val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") - logWarning(s"Wrote data for ${splits.mkString(", ")} to $p") - } - 
} GpuParquetScan.throwIfRebaseNeededInExceptionMode(table, dateRebaseMode, timestampRebaseMode) if (readDataSchema.length < table.getNumberOfColumns) { @@ -2695,8 +2699,12 @@ case class ParquetTableReader( } catch { case e: Exception => val dumpMsg = debugDumpPrefix.map { prefix => - val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") - s", data dumped to $p" + if (!debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") + s", data dumped to $p" + } else { + "" + } }.getOrElse("") throw new IOException(s"Error when processing $splitsString$dumpMsg", e) } @@ -2716,12 +2724,6 @@ case class ParquetTableReader( } override def close(): Unit = { - debugDumpPrefix.foreach { prefix => - if (debugDumpAlways) { - val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet") - logWarning(s"Wrote data for $splitsString to $p") - } - } reader.close() buffer.close() } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala index ff02ab09647..78d05efb0c2 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala @@ -162,7 +162,7 @@ object GpuSemaphore { * this is considered to be okay as there are other mechanisms in place, and it should be rather * rare. */ -private final class SemaphoreTaskInfo() extends Logging { +private final class SemaphoreTaskInfo(val taskAttemptId: Long) extends Logging { /** * This holds threads that are not on the GPU yet. Most of the time they are * blocked waiting for the semaphore to let them on, but it may hold one @@ -253,7 +253,7 @@ private final class SemaphoreTaskInfo() extends Logging { if (!done && shouldBlockOnSemaphore) { // We cannot be in a synchronized block and wait on the semaphore // so we have to release it and grab it again afterwards. - semaphore.acquire(numPermits, lastHeld) + semaphore.acquire(numPermits, lastHeld, taskAttemptId) synchronized { // We now own the semaphore so we need to wake up all of the other tasks that are // waiting. 
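The GpuExpressionsUtils change earlier in this patch caches all-null column vectors per thread in a java.util.LinkedHashMap constructed with accessOrder = true, which is exactly what turns the map's iteration order into least-recently-used order. Below is a minimal sketch of that idiom; the size bound via removeEldestEntry and the String values are illustrative additions (the patched cache stores GpuColumnVector instances and releases them in clear()), and all names here are hypothetical:

import java.util.{LinkedHashMap => JLinkedHashMap, Map => JMap}

// Access-ordered LinkedHashMap: get() and put() move an entry to the "youngest" end,
// so the eldest entry is always the least recently used one.
class LruCache[K, V](maxEntries: Int, onEvict: V => Unit)
    extends JLinkedHashMap[K, V](16, 0.75f, /* accessOrder = */ true) {

  // Invoked by LinkedHashMap after every put; returning true drops the eldest entry,
  // giving us a hook to release the evicted value first.
  override def removeEldestEntry(eldest: JMap.Entry[K, V]): Boolean = {
    val evict = size() > maxEntries
    if (evict) onEvict(eldest.getValue)
    evict
  }
}

object LruCacheDemo {
  def main(args: Array[String]): Unit = {
    val cache = new LruCache[Int, String](2, v => println(s"evicting $v"))
    cache.put(1, "a")
    cache.put(2, "b")
    cache.get(1)          // touch key 1 so it becomes the most recently used entry
    cache.put(3, "c")     // prints "evicting b": key 2 is now the least recently used
    println(cache.keySet) // [1, 3]
  }
}
// end of illustrative sketch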
@@ -280,7 +280,7 @@ private final class SemaphoreTaskInfo() extends Logging { } } - def tryAcquire(semaphore: GpuBackingSemaphore): Boolean = synchronized { + def tryAcquire(semaphore: GpuBackingSemaphore, taskAttemptId: Long): Boolean = synchronized { val t = Thread.currentThread() if (hasSemaphore) { activeThreads.add(t) @@ -288,7 +288,7 @@ private final class SemaphoreTaskInfo() extends Logging { } else { if (blockedThreads.size() == 0) { // No other threads for this task are waiting, so we might be able to grab this directly - val ret = semaphore.tryAcquire(numPermits, lastHeld) + val ret = semaphore.tryAcquire(numPermits, lastHeld, taskAttemptId) if (ret) { hasSemaphore = true activeThreads.add(t) @@ -333,9 +333,9 @@ private final class GpuSemaphore() extends Logging { val taskAttemptId = context.taskAttemptId() val taskInfo = tasks.computeIfAbsent(taskAttemptId, _ => { onTaskCompletion(context, completeTask) - new SemaphoreTaskInfo() + new SemaphoreTaskInfo(taskAttemptId) }) - if (taskInfo.tryAcquire(semaphore)) { + if (taskInfo.tryAcquire(semaphore, taskAttemptId)) { GpuDeviceManager.initializeFromTask() SemaphoreAcquired } else { @@ -357,7 +357,7 @@ private final class GpuSemaphore() extends Logging { val taskAttemptId = context.taskAttemptId() val taskInfo = tasks.computeIfAbsent(taskAttemptId, _ => { onTaskCompletion(context, completeTask) - new SemaphoreTaskInfo() + new SemaphoreTaskInfo(taskAttemptId) }) taskInfo.blockUntilReady(semaphore) GpuDeviceManager.initializeFromTask() @@ -381,6 +381,7 @@ private final class GpuSemaphore() extends Logging { def completeTask(context: TaskContext): Unit = { val taskAttemptId = context.taskAttemptId() GpuTaskMetrics.get.updateRetry(taskAttemptId) + GpuTaskMetrics.get.updateMaxMemory(taskAttemptId) val refs = tasks.remove(taskAttemptId) if (refs == null) { throw new IllegalStateException(s"Completion of unknown task $taskAttemptId") diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala index c12b8c4d5d6..344a1ae21fd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala @@ -50,10 +50,25 @@ trait LineBufferer extends AutoCloseable { */ def getLength: Long + /** + * Get the number of lines currently added to this that were not filtered out. + */ + def getNumLines: Int + /** * Add a new line of bytes to the data to process.
*/ def add(line: Array[Byte], offset: Int, len: Int): Unit + + def isWhiteSpace(b: Byte): Boolean = { + b == ' ' || b == '\t' || b == '\r' || b == '\n' + } + + def isEmpty(line: Array[Byte], lineOffset: Int, lineLen: Int): Boolean = { + (0 until lineLen).forall { idx => + isWhiteSpace(line(lineOffset + idx)) + } + } } /** @@ -64,18 +79,27 @@ trait LineBuffererFactory[BUFF <: LineBufferer] { } object HostLineBuffererFactory extends LineBuffererFactory[HostLineBufferer] { + override def createBufferer(estimatedSize: Long, + lineSeparatorInRead: Array[Byte]): HostLineBufferer = + new HostLineBufferer(estimatedSize, lineSeparatorInRead, false) +} + +object FilterEmptyHostLineBuffererFactory extends LineBuffererFactory[HostLineBufferer] { override def createBufferer(estimatedSize: Long, lineSeparatorInRead: Array[Byte]): HostLineBufferer = - new HostLineBufferer(estimatedSize, lineSeparatorInRead) + new HostLineBufferer(estimatedSize, lineSeparatorInRead, true) } /** * Buffer the lines in a single HostMemoryBuffer with the separator inserted inbetween each of * the lines. */ -class HostLineBufferer(size: Long, separator: Array[Byte]) extends LineBufferer { +class HostLineBufferer(size: Long, + separator: Array[Byte], + filterEmpty: Boolean) extends LineBufferer { private var buffer = HostMemoryBuffer.allocate(size) private var location: Long = 0 + private var numLines: Int = 0 def grow(needed: Long): Unit = { val newSize = math.max(buffer.getLength * 2, needed) @@ -88,20 +112,21 @@ class HostLineBufferer(size: Long, separator: Array[Byte]) extends LineBufferer override def getLength: Long = location - override def add(line: Array[Byte], lineOffset: Int, lineLen: Int): Unit = { - val newTotal = location + lineLen + separator.length - if (newTotal > buffer.getLength) { - grow(newTotal) - } + override def getNumLines: Int = numLines - // Can have an empty line, do not write this to buffer but add the separator - // and totalRows - if (lineLen != 0) { + override def add(line: Array[Byte], lineOffset: Int, lineLen: Int): Unit = { + // Empty lines are filtered out + if (!filterEmpty || !isEmpty(line, lineOffset, lineLen)) { + numLines += 1 + val newTotal = location + lineLen + separator.length + if (newTotal > buffer.getLength) { + grow(newTotal) + } buffer.setBytes(location, line, lineOffset, lineLen) location = location + lineLen + buffer.setBytes(location, separator, 0, separator.length) + location = location + separator.length } - buffer.setBytes(location, separator, 0, separator.length) - location = location + separator.length } def getBufferAndRelease: HostMemoryBuffer = { @@ -139,10 +164,13 @@ class HostStringColBufferer(size: Long, separator: Array[Byte]) extends LineBuff override def getLength: Long = dataLocation + override def getNumLines: Int = numRows + override def add(line: Array[Byte], lineOffset: Int, lineLen: Int): Unit = { if (numRows + 1 > rowsAllocated) { val newRowsAllocated = math.min(rowsAllocated * 2, Int.MaxValue - 1) - val tmpBuffer = HostMemoryBuffer.allocate((newRowsAllocated + 1) * DType.INT32.getSizeInBytes) + val tmpBuffer = + HostMemoryBuffer.allocate((newRowsAllocated + 1) * DType.INT32.getSizeInBytes) tmpBuffer.copyFromHostBuffer(0, offsetsBuffer, 0, offsetsBuffer.getLength) offsetsBuffer.close() offsetsBuffer = tmpBuffer @@ -157,9 +185,7 @@ class HostStringColBufferer(size: Long, separator: Array[Byte]) extends LineBuff dataBuffer = newBuff } } - if (lineLen != 0) { - dataBuffer.setBytes(dataLocation, line, lineOffset, lineLen) - } + 
dataBuffer.setBytes(dataLocation, line, lineOffset, lineLen) offsetsBuffer.setInt(numRows * DType.INT32.getSizeInBytes, dataLocation.toInt) dataLocation += lineLen numRows += 1 @@ -372,7 +398,7 @@ abstract class GpuTextBasedPartitionReader[BUFF <: LineBufferer, FACT <: LineBuf && totalSize <= maxBytesPerChunk /* soft limit and returns at least one row */) { val line = lineReader.next() hmb.add(line.getBytes, 0, line.getLength) - totalRows += 1 + totalRows = hmb.getNumLines totalSize = hmb.getLength } //Indicate this is the last chunk diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala index 09cca5285a2..8cdf846f355 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala @@ -370,6 +370,7 @@ class GpuTransitionOverrides extends Rule[SparkPlan] { case _: GpuDataSourceScanExec => true case _: DataSourceV2ScanExecBase => true case _: RDDScanExec => true // just in case an RDD was reading in data + case _: ExpandExec => true case _ => false } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index 4ad126686e5..3e415e23d56 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -131,11 +131,11 @@ object RapidsPluginUtils extends Logging { val possibleRapidsJarURLs = classloader.getResources(propName).asScala.toSet.toSeq.filter { url => { val urlPath = url.toString - // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-24.10.0-spark341.jar, + // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-24.12.0-spark341.jar, // and files stored under subdirs of '!/', e.g. - // rapids-4-spark_2.12-24.10.0-cuda11.jar!/spark330/rapids4spark-version-info.properties + // rapids-4-spark_2.12-24.12.0-cuda11.jar!/spark330/rapids4spark-version-info.properties // We only want to find the main jar, e.g. - // rapids-4-spark_2.12-24.10.0-cuda11.jar!/rapids4spark-version-info.properties + // rapids-4-spark_2.12-24.12.0-cuda11.jar!/rapids4spark-version-info.properties !urlPath.contains("rapids-4-spark-") && urlPath.endsWith("!/" + propName) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PrioritySemaphore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PrioritySemaphore.scala index 6fdadf10e72..dc90382d3a0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PrioritySemaphore.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PrioritySemaphore.scala @@ -27,21 +27,30 @@ class PrioritySemaphore[T](val maxPermits: Int)(implicit ordering: Ordering[T]) private val lock = new ReentrantLock() private var occupiedSlots: Int = 0 - private case class ThreadInfo(priority: T, condition: Condition, numPermits: Int) { + private case class ThreadInfo(priority: T, condition: Condition, numPermits: Int, taskId: Long) { var signaled: Boolean = false } + // use task id as tie breaker when priorities are equal (both are 0 because never hold lock) + private val priorityComp = Ordering.by[ThreadInfo, T](_.priority).reverse. 
+ thenComparing((a, b) => a.taskId.compareTo(b.taskId)) + // We expect a relatively small number of threads to be contending for this lock at any given // time, therefore we are not concerned with the insertion/removal time complexity. private val waitingQueue: PriorityQueue[ThreadInfo] = - new PriorityQueue[ThreadInfo](Ordering.by[ThreadInfo, T](_.priority).reverse) + new PriorityQueue[ThreadInfo](priorityComp) - def tryAcquire(numPermits: Int, priority: T): Boolean = { + def tryAcquire(numPermits: Int, priority: T, taskAttemptId: Long): Boolean = { lock.lock() try { - if (waitingQueue.size() > 0 && ordering.gt(waitingQueue.peek.priority, priority)) { + if (waitingQueue.size() > 0 && + priorityComp.compare( + waitingQueue.peek(), + ThreadInfo(priority, null, numPermits, taskAttemptId) + ) < 0) { false - } else if (!canAcquire(numPermits)) { + } + else if (!canAcquire(numPermits)) { false } else { commitAcquire(numPermits) @@ -52,12 +61,12 @@ class PrioritySemaphore[T](val maxPermits: Int)(implicit ordering: Ordering[T]) } } - def acquire(numPermits: Int, priority: T): Unit = { + def acquire(numPermits: Int, priority: T, taskAttemptId: Long): Unit = { lock.lock() try { - if (!tryAcquire(numPermits, priority)) { + if (!tryAcquire(numPermits, priority, taskAttemptId)) { val condition = lock.newCondition() - val info = ThreadInfo(priority, condition, numPermits) + val info = ThreadInfo(priority, condition, numPermits, taskAttemptId) try { waitingQueue.add(info) while (!info.signaled) { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index c3c04b9a607..c2f867f8a80 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -551,12 +551,6 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .integerConf .createWithDefault(2) - val SHUFFLE_SPILL_THREADS = conf("spark.rapids.sql.shuffle.spillThreads") - .doc("Number of threads used to spill shuffle data to disk in the background.") - .commonlyUsed() - .integerConf - .createWithDefault(6) - val GPU_BATCH_SIZE_BYTES = conf("spark.rapids.sql.batchSizeBytes") .doc("Set the target number of bytes for a GPU batch. Splits sizes for input data " + "is covered by separate configs. The maximum setting is 2 GB to avoid exceeding the " + @@ -1249,6 +1243,12 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .booleanConf .createWithDefault(true) + val ENABLE_COALESCE_AFTER_EXPAND = conf("spark.rapids.sql.coalesceAfterExpand.enabled") + .doc("When set to false disables the coalesce after GPU Expand. ") + .internal() + .booleanConf + .createWithDefault(true) + val ENABLE_ORC_FLOAT_TYPES_TO_STRING = conf("spark.rapids.sql.format.orc.floatTypesToString.enable") .doc("When reading an ORC file, the source data schemas(schemas of ORC file) may differ " + @@ -2412,7 +2412,7 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. |On startup use: `--conf [conf key]=[conf value]`. 
For example: | |``` - |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar \ + |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ |--conf spark.plugins=com.nvidia.spark.SQLPlugin \ |--conf spark.rapids.sql.concurrentGpuTasks=2 |``` @@ -2846,6 +2846,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isExpandPreprojectEnabled: Boolean = get(ENABLE_EXPAND_PREPROJECT) + lazy val isCoalesceAfterExpandEnabled: Boolean = get(ENABLE_COALESCE_AFTER_EXPAND) + lazy val multiThreadReadNumThreads: Int = { // Use the largest value set among all the options. val deprecatedConfs = Seq( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala index 5003ba46184..eb3692d434a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,8 +28,9 @@ import com.nvidia.spark.rapids.StorageTier.StorageTier import com.nvidia.spark.rapids.format.TableMeta import org.apache.commons.io.IOUtils +import org.apache.spark.TaskContext import org.apache.spark.sql.rapids.{GpuTaskMetrics, RapidsDiskBlockManager} -import org.apache.spark.sql.rapids.execution.SerializedHostTableUtils +import org.apache.spark.sql.rapids.execution.{SerializedHostTableUtils, TrampolineUtil} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -38,6 +39,13 @@ class RapidsDiskStore(diskBlockManager: RapidsDiskBlockManager) extends RapidsBufferStoreWithoutSpill(StorageTier.DISK) { private[this] val sharedBufferFiles = new ConcurrentHashMap[RapidsBufferId, File] + private def reportDiskAllocMetrics(metrics: GpuTaskMetrics): String = { + val taskId = TaskContext.get().taskAttemptId() + val totalSize = metrics.getDiskBytesAllocated + val maxSize = metrics.getMaxDiskBytesAllocated + s"total size for task $taskId is $totalSize, max size is $maxSize" + } + override protected def createBuffer( incoming: RapidsBuffer, catalog: RapidsBufferCatalog, @@ -58,7 +66,6 @@ class RapidsDiskStore(diskBlockManager: RapidsDiskBlockManager) } else { writeToFile(incoming, path, append = false, stream) } - logDebug(s"Spilled to $path $fileOffset:$diskLength") val buff = incoming match { case _: RapidsHostBatchBuffer => @@ -79,6 +86,12 @@ class RapidsDiskStore(diskBlockManager: RapidsDiskBlockManager) incoming.meta, incoming.getSpillPriority) } + TrampolineUtil.incTaskMetricsDiskBytesSpilled(uncompressedSize) + + val metrics = GpuTaskMetrics.get + metrics.incDiskBytesAllocated(uncompressedSize) + logDebug(s"acquiring resources for disk buffer $id of size $uncompressedSize bytes") + logDebug(reportDiskAllocMetrics(metrics)) Some(buff) } @@ -181,6 +194,11 @@ class RapidsDiskStore(diskBlockManager: RapidsDiskBlockManager) } override protected def releaseResources(): Unit = { + logDebug(s"releasing resources for disk buffer $id of size $memoryUsedBytes bytes") + val metrics = GpuTaskMetrics.get + metrics.decDiskBytesAllocated(memoryUsedBytes) + logDebug(reportDiskAllocMetrics(metrics)) + // Buffers that share paths must be cleaned up elsewhere if (id.canShareDiskPaths) { 
sharedBufferFiles.remove(id) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala index 32fe6229674..235ed9ddb45 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,8 +28,8 @@ import com.nvidia.spark.rapids.SpillPriorities.{applyPriorityOffset, HOST_MEMORY import com.nvidia.spark.rapids.StorageTier.StorageTier import com.nvidia.spark.rapids.format.TableMeta +import org.apache.spark.TaskContext import org.apache.spark.sql.rapids.GpuTaskMetrics -import org.apache.spark.sql.rapids.execution.TrampolineUtil import org.apache.spark.sql.rapids.storage.RapidsStorageUtils import org.apache.spark.sql.types.DataType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -99,8 +99,8 @@ class RapidsHostMemoryStore( } else { val amountSpilled = synchronousSpill(targetTotalSize, catalog, stream) if (amountSpilled != 0) { - logDebug(s"Spilled $amountSpilled bytes from ${name} to make room for ${buffer.id}") - TrampolineUtil.incTaskMetricsDiskBytesSpilled(amountSpilled) + logDebug(s"Task ${TaskContext.get.taskAttemptId()} spilled $amountSpilled bytes from " + + s"${name} to make room for ${buffer.id}") } // if after spill we can fit the new buffer, return true buffer.memoryUsedBytes <= (ms - currentSize) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala index 326216858c9..5251bf49390 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala @@ -26,7 +26,7 @@ import com.nvidia.spark.rapids.shims.{DistributionUtil, SparkShimImpl} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, Cast, ComplexTypeMergingExpression, Expression, QuaternaryExpression, RuntimeReplaceable, String2TrimExpression, TernaryExpression, TimeZoneAwareExpression, UnaryExpression, UTCTimestamp, WindowExpression, WindowFunction} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.catalyst.trees.TreeNodeTag +import org.apache.spark.sql.catalyst.trees.{TreeNodeTag, UnaryLike} import org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec @@ -1347,7 +1347,7 @@ abstract class RuntimeReplaceableUnaryAstExprMeta[INPUT <: RuntimeReplaceable]( /** * Base class for metadata around `UnaryExpression`.
 */ -abstract class UnaryExprMeta[INPUT <: UnaryExpression]( +abstract class UnaryExprMeta[INPUT <: Expression with UnaryLike[Expression]]( expr: INPUT, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala index 487611add08..891e837d7e1 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala @@ -107,8 +107,23 @@ object GpuProjectExec { // This can help avoid contiguous splits in some cases when the input data is also contiguous GpuColumnVector.incRefCounts(cb) } else { - val newColumns = boundExprs.safeMap(_.columnarEval(cb)).toArray[ColumnVector] - new ColumnarBatch(newColumns, cb.numRows()) + try { + // In some cases like Expand, we have a lot of Expressions generating null vectors. + // We can cache the null vectors to avoid creating them every time. + // Since we're attempting to reuse the whole null vector, it is important to be aware that + // the datatype and vector length should be the same. + // Within project(cb: ColumnarBatch, boundExprs: Seq[Expression]), all output vectors share + // the same vector length, which facilitates the reuse of null vectors. + // When leaving the scope of project(cb: ColumnarBatch, boundExprs: Seq[Expression]), + // the cached null vectors will be cleared because the next ColumnarBatch may have + // a different vector length, so the cached vectors cannot be reused. + GpuExpressionsUtils.cachedNullVectors.get.clear() + + val newColumns = boundExprs.safeMap(_.columnarEval(cb)).toArray[ColumnVector] + new ColumnarBatch(newColumns, cb.numRows()) + } finally { + GpuExpressionsUtils.cachedNullVectors.get.clear() + } } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/GpuLore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/GpuLore.scala index a51a1e13a5e..312b277f077 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/GpuLore.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/GpuLore.scala @@ -89,23 +89,21 @@ object GpuLore { } def dumpObject[T: ClassTag](obj: T, path: Path, hadoopConf: Configuration): Unit = { - withResource(path.getFileSystem(hadoopConf)) { fs => - withResource(fs.create(path, false)) { fout => - val serializerStream = SparkEnv.get.serializer.newInstance().serializeStream(fout) - withResource(serializerStream) { ser => - ser.writeObject(obj) - } + val fs = path.getFileSystem(hadoopConf) + withResource(fs.create(path, true)) { fout => + val serializerStream = SparkEnv.get.serializer.newInstance().serializeStream(fout) + withResource(serializerStream) { ser => + ser.writeObject(obj) } } } def loadObject[T: ClassTag](path: Path, hadoopConf: Configuration): T = { - withResource(path.getFileSystem(hadoopConf)) { fs => - withResource(fs.open(path)) { fin => - val serializerStream = SparkEnv.get.serializer.newInstance().deserializeStream(fin) - withResource(serializerStream) { ser => - ser.readObject().asInstanceOf[T] - } + val fs = path.getFileSystem(hadoopConf) + withResource(fs.open(path)) { fin => + val serializerStream = SparkEnv.get.serializer.newInstance().deserializeStream(fin) + withResource(serializerStream) { ser => + ser.readObject().asInstanceOf[T] } } } @@ -186,6 +184,12 @@ object GpuLore { idGen.computeIfAbsent(executionId, _ => new AtomicInteger(0)).getAndIncrement() } } + /** + *
Executions that have checked the lore output root path. + * Key is [[SQLExecution.EXECUTION_ID_KEY]]. + */ + private val loreOutputRootPathChecked: ConcurrentHashMap[String, Boolean] = + new ConcurrentHashMap[String, Boolean]() def tagForLore(sparkPlan: SparkPlan, rapidsConf: RapidsConf): SparkPlan = { val loreDumpIds = rapidsConf.loreDumpIds @@ -197,6 +201,20 @@ object GpuLore { s"when ${RapidsConf.LORE_DUMP_IDS.key} is set.")) val spark = SparkShimImpl.sessionFromPlan(sparkPlan) + + Option(spark.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)).foreach { + executionId => + loreOutputRootPathChecked.computeIfAbsent(executionId, _ => { + val path = new Path(loreOutputRootPath) + val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration) + if (fs.exists(path) && fs.listStatus(path).nonEmpty) { + throw new IllegalArgumentException( + s"LORE dump path $loreOutputRootPath already exists and is not empty.") + } + true + }) + } + val hadoopConf = { val sc = spark.sparkContext sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/dump.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/dump.scala index 1b9967e1bf4..ee0c7a7bd7a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/dump.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/lore/dump.scala @@ -72,7 +72,7 @@ class GpuLoreDumpRDD(info: LoreDumpRDDInfo, input: RDD[ColumnarBatch]) private def dumpCurrentBatch(): ColumnarBatch = { val outputPath = pathOfBatch(split.index, batchIdx) val outputStream = outputPath.getFileSystem(info.hadoopConf.value.value) - .create(outputPath, false) + .create(outputPath, true) DumpUtils.dumpToParquet(nextBatch.get, outputStream) nextBatch.get } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala index 2024fb5891d..506b22a22ab 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala @@ -319,7 +319,7 @@ object JsonPartitionReader { withResource(new NvtxWithMetrics(formatName + " decode", NvtxColor.DARK_GREEN, decodeTime)) { _ => try { - Table.readJSON(cudfSchema, jsonOpts, dataBuffer, 0, dataSize) + Table.readJSON(cudfSchema, jsonOpts, dataBuffer, 0, dataSize, dataBufferer.getNumLines) } catch { case e: AssertionError if e.getMessage == "CudfColumns can't be null or empty" => // this happens when every row in a JSON file is invalid (or we are @@ -344,9 +344,10 @@ class JsonPartitionReader( maxRowsPerChunk: Integer, maxBytesPerChunk: Long, execMetrics: Map[String, GpuMetric]) - extends GpuTextBasedPartitionReader[HostLineBufferer, HostLineBuffererFactory.type](conf, + extends GpuTextBasedPartitionReader[HostLineBufferer, + FilterEmptyHostLineBuffererFactory.type](conf, partFile, dataSchema, readDataSchema, parsedOptions.lineSeparatorInRead, maxRowsPerChunk, - maxBytesPerChunk, execMetrics, HostLineBuffererFactory) { + maxBytesPerChunk, execMetrics, FilterEmptyHostLineBuffererFactory) { def buildJsonOptions(parsedOptions: JSONOptions): cudf.JSONOptions = GpuJsonReadCommon.cudfJsonOptions(parsedOptions) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala index d584b3ea8a5..02153b5cb87 100644 --- 
a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -325,6 +325,12 @@ trait GpuAvroReaderBase extends Logging { self: FilePartitionReaderBase => hostBuf: HostMemoryBuffer, bufSize: Long, splits: Array[PartitionedFile]): Table = { + debugDumpPrefix.foreach { prefix => + if (debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro") + logWarning(s"Wrote data for ${splits.mkString("; ")} to $p") + } + } val readOpts = CudfAvroOptions.builder() .includeColumn(readDataSchema.fieldNames.toSeq: _*) .build() @@ -341,20 +347,16 @@ trait GpuAvroReaderBase extends Logging { self: FilePartitionReaderBase => } catch { case e: Exception => val dumpMsg = debugDumpPrefix.map { prefix => - val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro") - s", data dumped to $p" + if (!debugDumpAlways) { + val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro") + s", data dumped to $p" + } else { + "" + } }.getOrElse("") throw new IOException( s"Error when processing file splits [${splits.mkString("; ")}]$dumpMsg", e) } - closeOnExcept(table) { _ => - debugDumpPrefix.foreach { prefix => - if (debugDumpAlways) { - val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro") - logWarning(s"Wrote data for ${splits.mkString("; ")} to $p") - } - } - } table } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonReadCommon.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonReadCommon.scala index 9acc9063750..c593eebe26e 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonReadCommon.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonReadCommon.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.rapids import java.util.Locale -import ai.rapids.cudf.{BinaryOp, CaptureGroups, ColumnVector, ColumnView, DType, RegexProgram, Scalar, Schema, Table} +import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, Scalar, Schema, Table} +import com.fasterxml.jackson.core.JsonParser import com.nvidia.spark.rapids.{ColumnCastUtil, GpuCast, GpuColumnVector, GpuScalar, GpuTextBasedPartitionReader} import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingArray @@ -109,7 +110,6 @@ object GpuJsonReadCommon { private lazy val specialUnquotedFloats = Seq("NaN", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity") private lazy val specialQuotedFloats = specialUnquotedFloats.map(s => '"'+s+'"') - private lazy val allSpecialFloats = specialUnquotedFloats ++ specialQuotedFloats /** * JSON has strict rules about valid numeric formats. See https://www.json.org/ for specification. 
@@ -120,64 +120,41 @@ object GpuJsonReadCommon { private def sanitizeFloats(input: ColumnView, options: JSONOptions): ColumnVector = { // Note that this is not 100% consistent with Spark versions prior to Spark 3.3.0 // due to https://issues.apache.org/jira/browse/SPARK-38060 - // cuDF `isFloat` supports some inputs that are not valid JSON numbers, such as `.1`, `1.`, - // and `+1` so we use a regular expression to match valid JSON numbers instead - // TODO The majority of this validation needs to move to CUDF so that we can invalidate - // an entire line/row instead of a single field. - // https://github.com/NVIDIA/spark-rapids/issues/10534 - val jsonNumberRegexp = if (options.allowNumericLeadingZeros) { - "^-?[0-9]+(?:\\.[0-9]+)?(?:[eE][\\-\\+]?[0-9]+)?$" - } else { - "^-?(?:(?:[1-9][0-9]*)|0)(?:\\.[0-9]+)?(?:[eE][\\-\\+]?[0-9]+)?$" - } - val prog = new RegexProgram(jsonNumberRegexp, CaptureGroups.NON_CAPTURE) - val isValid = if (options.allowNonNumericNumbers) { - withResource(ColumnVector.fromStrings(allSpecialFloats: _*)) { nonNumeric => - withResource(input.matchesRe(prog)) { isJsonNumber => - withResource(input.contains(nonNumeric)) { nonNumeric => - isJsonNumber.or(nonNumeric) - } + if (options.allowNonNumericNumbers) { + // Need to normalize the quotes to non-quoted to parse properly + withResource(ColumnVector.fromStrings(specialQuotedFloats: _*)) { quoted => + withResource(ColumnVector.fromStrings(specialUnquotedFloats: _*)) { unquoted => + input.findAndReplaceAll(quoted, unquoted) } } } else { - input.matchesRe(prog) - } - val cleaned = withResource(isValid) { _ => - withResource(Scalar.fromNull(DType.STRING)) { nullString => - isValid.ifElse(input, nullString) - } + input.copyToColumnVector() } + } + + private def sanitizeInts(input: ColumnView): ColumnVector = { + // Integer numbers cannot look like a float, so no `.` or e The rest of the parsing should + // handle this correctly. The rest of the validation is in CUDF itself - withResource(cleaned) { _ => - if (options.allowNonNumericNumbers) { - // Need to normalize the quotes to non-quoted to parse properly - withResource(ColumnVector.fromStrings(specialQuotedFloats: _*)) { quoted => - withResource(ColumnVector.fromStrings(specialUnquotedFloats: _*)) { unquoted => - cleaned.findAndReplaceAll(quoted, unquoted) + val tmp = withResource(Scalar.fromString(".")) { dot => + withResource(input.stringContains(dot)) { hasDot => + withResource(Scalar.fromString("e")) { e => + withResource(input.stringContains(e)) { hase => + hasDot.or(hase) } } - } else { - cleaned.incRefCount() } } - } - - private def sanitizeInts(input: ColumnView, options: JSONOptions): ColumnVector = { - // Integer numbers cannot look like a float, so no `.` The rest of the parsing should - // handle this correctly. - // TODO The majority of this validation needs to move to CUDF so that we can invalidate - // an entire line/row instead of a single field. 
- // https://github.com/NVIDIA/spark-rapids/issues/10534 - val jsonNumberRegexp = if (options.allowNumericLeadingZeros) { - "^-?[0-9]+$" - } else { - "^-?(?:(?:[1-9][0-9]*)|0)$" + val invalid = withResource(tmp) { _ => + withResource(Scalar.fromString("E")) { E => + withResource(input.stringContains(E)) { hasE => + tmp.or(hasE) + } + } } - - val prog = new RegexProgram(jsonNumberRegexp, CaptureGroups.NON_CAPTURE) - withResource(input.matchesRe(prog)) { isValid => + withResource(invalid) { _ => withResource(Scalar.fromNull(DType.STRING)) { nullString => - isValid.ifElse(input, nullString) + invalid.ifElse(nullString, input) } } } @@ -194,32 +171,11 @@ object GpuJsonReadCommon { } } - private def sanitizeUnquotedDecimal(input: ColumnView, options: JSONOptions): ColumnVector = { - // For unquoted decimal values the number has to look like it is floating point before it is - // parsed, so this follows that, but without the special cases for INF/NaN - // TODO The majority of this validation needs to move to CUDF so that we can invalidate - // an entire line/row instead of a single field. - // https://github.com/NVIDIA/spark-rapids/issues/10534 - val jsonNumberRegexp = if (options.allowNumericLeadingZeros) { - "^-?[0-9]+(?:\\.[0-9]+)?(?:[eE][\\-\\+]?[0-9]+)?$" - } else { - "^-?(?:(?:[1-9][0-9]*)|0)(?:\\.[0-9]+)?(?:[eE][\\-\\+]?[0-9]+)?$" - } - val prog = new RegexProgram(jsonNumberRegexp, CaptureGroups.NON_CAPTURE) - withResource(input.matchesRe(prog)) { isValid => - withResource(Scalar.fromNull(DType.STRING)) { nullString => - isValid.ifElse(input, nullString) - } - } - } - private def sanitizeDecimal(input: ColumnView, options: JSONOptions): ColumnVector = { assert(options.locale == Locale.US) withResource(isQuotedString(input)) { isQuoted => - withResource(sanitizeUnquotedDecimal(input, options)) { unquoted => - withResource(sanitizeQuotedDecimalInUSLocale(input)) { quoted => - isQuoted.ifElse(quoted, unquoted) - } + withResource(sanitizeQuotedDecimalInUSLocale(input)) { quoted => + isQuoted.ifElse(quoted, input) } } } @@ -231,13 +187,13 @@ object GpuJsonReadCommon { } } - private def castStringToDecimal(input: ColumnVector, dt: DecimalType): ColumnVector = + private def castStringToDecimal(input: ColumnVector, dt: DecimalType): ColumnVector = { + // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 CastStrings.toDecimal(input, false, false, dt.precision, -dt.scale) + } private def castJsonStringToBool(input: ColumnView): ColumnVector = { - // TODO This validation needs to move to CUDF so that we can invalidate - // an entire line/row instead of a single field. 
- // https://github.com/NVIDIA/spark-rapids/issues/10534 + // Sadly there is no good kernel right now to do just this check/conversion val isTrue = withResource(Scalar.fromString("true")) { trueStr => input.equalTo(trueStr) } @@ -336,7 +292,7 @@ object GpuJsonReadCommon { case (cv, Some(dt)) if (dt == ByteType || dt == ShortType || dt == IntegerType || dt == LongType ) && cv.getType == DType.STRING => - withResource(sanitizeInts(cv, options)) { tmp => + withResource(sanitizeInts(cv)) { tmp => CastStrings.toInteger(tmp, false, GpuColumnVector.getNonNestedRapidsType(dt)) } case (cv, Some(dt)) if cv.getType == DType.STRING => @@ -363,12 +319,25 @@ object GpuJsonReadCommon { } def cudfJsonOptions(options: JSONOptions): ai.rapids.cudf.JSONOptions = { + // This is really ugly, but options.allowUnquotedControlChars is marked as private + // and this is the only way I know to get it without even uglier tricks + @scala.annotation.nowarn("msg=Java enum ALLOW_UNQUOTED_CONTROL_CHARS in " + + "Java enum Feature is deprecated") + val allowUnquotedControlChars = + options.buildJsonFactory() + .isEnabled(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS) ai.rapids.cudf.JSONOptions.builder() .withRecoverWithNull(true) .withMixedTypesAsStrings(true) .withNormalizeWhitespace(true) .withKeepQuotes(true) .withNormalizeSingleQuotes(options.allowSingleQuotes) + .withStrictValidation(true) + .withLeadingZeros(options.allowNumericLeadingZeros) + .withNonNumericNumbers(options.allowNonNumericNumbers) + .withUnquotedControlChars(allowUnquotedControlChars) + .withCudfPruneSchema(true) + .withExperimental(true) .build() } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala index 7b49a8f3351..e60aefb8d59 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala @@ -183,7 +183,7 @@ case class GpuJsonToStructs( val table = withResource(new JsonDeviceDataSource(combined)) { ds => // Step 4: Have cudf parse the JSON data try { - cudf.Table.readJSON(cudfSchema, jsonOptions, ds) + cudf.Table.readJSON(cudfSchema, jsonOptions, ds, numRows) } catch { case e : RuntimeException => throw new JsonParsingException("Currently some Json to Struct cases " + diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuTaskMetrics.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuTaskMetrics.scala index c89e26f0a24..5f1052f0e59 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuTaskMetrics.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuTaskMetrics.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,6 +78,35 @@ class NanoSecondAccumulator extends AccumulatorV2[jl.Long, NanoTime] { override def value: NanoTime = NanoTime(_sum) } +class HighWatermarkAccumulator extends AccumulatorV2[jl.Long, Long] { + private var _value = 0L + override def isZero: Boolean = _value == 0 + + override def copy(): HighWatermarkAccumulator = { + val newAcc = new HighWatermarkAccumulator + newAcc._value = this._value + newAcc + } + + override def reset(): Unit = { + _value = 0 + } + + override def add(v: jl.Long): Unit = { + _value += v + } + + override def merge(other: AccumulatorV2[jl.Long, Long]): Unit = other match { + case wa: HighWatermarkAccumulator => + _value = _value.max(wa._value) + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: Long = _value +} + class GpuTaskMetrics extends Serializable { private val semWaitTimeNs = new NanoSecondAccumulator private val retryCount = new LongAccumulator @@ -91,6 +120,28 @@ class GpuTaskMetrics extends Serializable { private val readSpillFromHostTimeNs = new NanoSecondAccumulator private val readSpillFromDiskTimeNs = new NanoSecondAccumulator + private val maxDeviceMemoryBytes = new HighWatermarkAccumulator + private val maxDiskMemoryBytes = new HighWatermarkAccumulator + + private var diskBytesAllocated: Long = 0 + private var maxDiskBytesAllocated: Long = 0 + + def getDiskBytesAllocated: Long = diskBytesAllocated + + def getMaxDiskBytesAllocated: Long = maxDiskBytesAllocated + + def incDiskBytesAllocated(bytes: Long): Unit = { + diskBytesAllocated += bytes + maxDiskBytesAllocated = maxDiskBytesAllocated.max(diskBytesAllocated) + } + + def decDiskBytesAllocated(bytes: Long): Unit = { + diskBytesAllocated -= bytes + // For some reason it's possible for the task to start out by releasing resources, + // possibly from a previous task, in such case we probably should just ignore it. + diskBytesAllocated = diskBytesAllocated.max(0) + } + private val metrics = Map[String, AccumulatorV2[_, _]]( "gpuSemaphoreWait" -> semWaitTimeNs, "gpuRetryCount" -> retryCount, @@ -100,7 +151,9 @@ class GpuTaskMetrics extends Serializable { "gpuSpillToHostTime" -> spillToHostTimeNs, "gpuSpillToDiskTime" -> spillToDiskTimeNs, "gpuReadSpillFromHostTime" -> readSpillFromHostTimeNs, - "gpuReadSpillFromDiskTime" -> readSpillFromDiskTimeNs + "gpuReadSpillFromDiskTime" -> readSpillFromDiskTimeNs, + "gpuMaxDeviceMemoryBytes" -> maxDeviceMemoryBytes, + "gpuMaxDiskMemoryBytes" -> maxDiskMemoryBytes ) def register(sc: SparkContext): Unit = { @@ -178,6 +231,21 @@ class GpuTaskMetrics extends Serializable { retryComputationTime.add(compNs) } } + + def updateMaxMemory(taskAttemptId: Long): Unit = { + val maxMem = RmmSpark.getAndResetGpuMaxMemoryAllocated(taskAttemptId) + if (maxMem > 0) { + // These metrics track the max amount of memory that is allocated on the gpu and disk, + // respectively, during the lifespan of a task. However, this update function only gets called + // once on task completion, whereas the actual logic tracking of the max value during memory + // allocations lives in the JNI. Therefore, we can stick the convention here of calling the + // add method instead of adding a dedicated max method to the accumulator. 
+ maxDeviceMemoryBytes.add(maxMem) + } + if (maxDiskBytesAllocated > 0) { + maxDiskMemoryBytes.add(maxDiskBytesAllocated) + } + } } /** diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala index 1b12f01f84b..afc05128ba9 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala @@ -297,6 +297,8 @@ abstract class RapidsShuffleThreadedWriterBase[K, V]( private def write(records: TimeTrackingIterator): Unit = { withResource(new NvtxRange("ThreadedWriter.write", NvtxColor.RED)) { _ => withResource(new NvtxRange("compute", NvtxColor.GREEN)) { _ => + // Timestamp when the main processing begins + val processingStart: Long = System.nanoTime() val mapOutputWriter = shuffleExecutorComponents.createMapOutputWriter( shuffleId, mapId, @@ -331,8 +333,7 @@ abstract class RapidsShuffleThreadedWriterBase[K, V]( var waitTimeOnLimiterNs: Long = 0L // Time spent computing ColumnarBatch sizes var batchSizeComputeTimeNs: Long = 0L - // Timestamp when the main processing begins - val processingStart: Long = System.nanoTime() + try { while (records.hasNext) { // get the record @@ -436,7 +437,7 @@ abstract class RapidsShuffleThreadedWriterBase[K, V]( serializationTimeMetric.foreach(_ += (serializationRatio * writeTimeNs).toLong) // we add all three here because this metric is meant to show the time // we are blocked on writes - shuffleWriteTimeMetric.foreach(_ += (openTimeNs + writeTimeNs + combineTimeNs)) + shuffleWriteTimeMetric.foreach(_ += (writeTimeNs + combineTimeNs)) shuffleCombineTimeMetric.foreach(_ += combineTimeNs) pl } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala index b675ef2bfbd..23b823e7117 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.rapids import java.util.Optional import ai.rapids.cudf -import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, Scalar, SegmentedReductionAggregation, Table} +import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, ReductionAggregation, Scalar, SegmentedReductionAggregation, Table} import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.Arm._ import com.nvidia.spark.rapids.ArrayIndexUtils.firstIndexAndNumElementUnchecked @@ -1651,7 +1651,8 @@ object GpuSequenceUtil { def computeSequenceSize( start: ColumnVector, stop: ColumnVector, - step: ColumnVector): ColumnVector = { + step: ColumnVector, + functionName: String): ColumnVector = { checkSequenceInputs(start, stop, step) val actualSize = GetSequenceSize(start, stop, step) val sizeAsLong = withResource(actualSize) { _ => @@ -1673,7 +1674,12 @@ object GpuSequenceUtil { // check max size withResource(Scalar.fromInt(MAX_ROUNDED_ARRAY_LENGTH)) { maxLen => withResource(sizeAsLong.lessOrEqualTo(maxLen)) { allValid => - require(isAllValidTrue(allValid), GetSequenceSize.TOO_LONG_SEQUENCE) + withResource(sizeAsLong.reduce(ReductionAggregation.max())) { maxSizeScalar => + require(isAllValidTrue(allValid), + RapidsErrorUtils.getTooLongSequenceErrorString( + maxSizeScalar.getLong.asInstanceOf[Int], + 
functionName)) + } } } // cast to int and return @@ -1713,7 +1719,7 @@ case class GpuSequence(start: Expression, stop: Expression, stepOpt: Option[Expr val steps = stepGpuColOpt.map(_.getBase.incRefCount()) .getOrElse(defaultStepsFunc(startCol, stopCol)) closeOnExcept(steps) { _ => - (computeSequenceSize(startCol, stopCol, steps), steps) + (computeSequenceSize(startCol, stopCol, steps, prettyName), steps) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 3169d6bc543..b04c188f7f3 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -640,7 +640,11 @@ object GpuToTimestamp { "yyyy-MM-dd HH:mm:ss" -> ParseFormatMeta(Option('-'), isTimestamp = true, raw"\A\d{4}-\d{1,2}-\d{1,2}[ T]\d{1,2}:\d{1,2}:\d{1,2}(\D|\s|\Z)"), "yyyy/MM/dd HH:mm:ss" -> ParseFormatMeta(Option('/'), isTimestamp = true, - raw"\A\d{4}/\d{1,2}/\d{1,2}[ T]\d{1,2}:\d{1,2}:\d{1,2}(\D|\s|\Z)") + raw"\A\d{4}/\d{1,2}/\d{1,2}[ T]\d{1,2}:\d{1,2}:\d{1,2}(\D|\s|\Z)"), + "yyyyMMdd" -> ParseFormatMeta(None, isTimestamp = false, + raw"\A\d{8}(\D|\s|\Z)"), + "yyyymmdd" -> ParseFormatMeta(None, isTimestamp = false, + raw"\A\d{8}(\D|\s|\Z)") ) /** remove whitespace before month and day */ @@ -762,8 +766,21 @@ object GpuToTimestamp { case RegexReplace(pattern, backref) => RegexReplace(pattern.replace('-', '/'), backref.replace('-', '/')) } - case Some('-') | Some(_) | None => + case Some('-') | Some(_) => regexReplaceRules + case None => + // For formats like `yyyyMMdd` that do not contain a separator, + // there is no need to apply the regexp replacement rules + // Note: this introduces the following inconsistent behavior compared to Spark + // Spark's behavior: + // to_date('20240101', 'yyyyMMdd') = 2024-01-01 + // to_date('202401 01', 'yyyyMMdd') = 2024-01-01 + // to_date('2024 0101', 'yyyyMMdd') = null + // GPU behavior: + // to_date('20240101', 'yyyyMMdd') = 2024-01-01 + // to_date('202401 01', 'yyyyMMdd') = null + // to_date('2024 0101', 'yyyyMMdd') = null + Seq() } // apply each rule in turn to the data diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala index 20414ef0ad5..f5004484680 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala @@ -123,7 +123,12 @@ object TrampolineUtil { * @param amountSpilled amount of memory spilled in bytes */ def incTaskMetricsDiskBytesSpilled(amountSpilled: Long): Unit = { - Option(TaskContext.get).foreach(_.taskMetrics().incDiskBytesSpilled(amountSpilled)) + Option(TaskContext.get).foreach(tc => { + val metrics = tc.taskMetrics() + if (metrics != null) { + metrics.incDiskBytesSpilled(amountSpilled) + } + }) } /** diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala index e9dca9497be..183028874d2 100644 --- a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala +++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala @@ -40,10 +40,7 @@ package com.nvidia.spark.rapids.shims import
ai.rapids.cudf._ import com.nvidia.spark.rapids.Arm._ -import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH - object GetSequenceSize { - val TOO_LONG_SEQUENCE = s"Too long sequence found. Should be <= $MAX_ROUNDED_ARRAY_LENGTH" /** * Compute the size of each sequence according to 'start', 'stop' and 'step'. * A row (Row[start, stop, step]) contains at least one null element will produce diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala new file mode 100644 index 00000000000..ddabbeaf1a3 --- /dev/null +++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "350"} +{"spark": "350db"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + +trait SequenceSizeTooLongErrorBuilder { + + def getTooLongSequenceErrorString(sequenceSize: Int, functionName: String): String = { + // For these Spark versions, the sequence length and function name + // do not appear in the exception message. + s"Too long sequence found. 
Should be <= $MAX_ROUNDED_ARRAY_LENGTH" + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index 68a6ce30569..dd387d453b5 100644 --- a/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.trees.Origin import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} -object RapidsErrorUtils extends RapidsQueryErrorUtils { +object RapidsErrorUtils extends RapidsQueryErrorUtils with SequenceSizeTooLongErrorBuilder { def invalidArrayIndexError(index: Int, numElements: Int, isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { // Follow the Spark string format before 3.3.0 diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index e5cdcd43568..a08f38e5596 100644 --- a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -21,64 +21,9 @@ {"spark": "332"} {"spark": "332cdh"} {"spark": "333"} -{"spark": "334"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.shims -import org.apache.spark.SparkDateTimeException -import org.apache.spark.sql.catalyst.trees.Origin -import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} +object RapidsErrorUtils extends RapidsErrorUtils330To334Base + with SequenceSizeTooLongErrorBuilder -object RapidsErrorUtils extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { - - def mapKeyNotExistError( - key: String, - keyType: DataType, - origin: Origin): NoSuchElementException = { - QueryExecutionErrors.mapKeyNotExistError(key, keyType, origin.context) - } - - def invalidArrayIndexError(index: Int, numElements: Int, - isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { - if (isElementAtF) { - QueryExecutionErrors.invalidElementAtIndexError(index, numElements) - } else { - QueryExecutionErrors.invalidArrayIndexError(index, numElements) - } - } - - def arithmeticOverflowError( - message: String, - hint: String = "", - errorContext: String = ""): ArithmeticException = { - QueryExecutionErrors.arithmeticOverflowError(message, hint, errorContext) - } - - def cannotChangeDecimalPrecisionError( - value: Decimal, - toType: DecimalType, - context: String = ""): ArithmeticException = { - QueryExecutionErrors.cannotChangeDecimalPrecisionError( - value, toType.precision, toType.scale, context - ) - } - - def overflowInIntegralDivideError(context: String = ""): ArithmeticException = { - QueryExecutionErrors.arithmeticOverflowError( - "Overflow in integral divide", "try_divide", context - ) - } - - def sparkDateTimeException(infOrNan: String): SparkDateTimeException = { - // These are the arguments required by SparkDateTimeException class to create error message. 
- val errorClass = "CAST_INVALID_INPUT" - val messageParameters = Array("DOUBLE", "TIMESTAMP", SQLConf.ANSI_ENABLED.key) - new SparkDateTimeException(errorClass, Array(infOrNan) ++ messageParameters) - } - - def sqlArrayIndexNotStartAtOneError(): RuntimeException = { - new ArrayIndexOutOfBoundsException("SQL array indices start at 1") - } -} diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils330To334Base.scala b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils330To334Base.scala new file mode 100644 index 00000000000..5e560faf90c --- /dev/null +++ b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils330To334Base.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "333"} +{"spark": "334"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import org.apache.spark.SparkDateTimeException +import org.apache.spark.sql.catalyst.trees.Origin +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} + +trait RapidsErrorUtils330To334Base extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { + + def mapKeyNotExistError( + key: String, + keyType: DataType, + origin: Origin): NoSuchElementException = { + QueryExecutionErrors.mapKeyNotExistError(key, keyType, origin.context) + } + + def invalidArrayIndexError(index: Int, numElements: Int, + isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { + if (isElementAtF) { + QueryExecutionErrors.invalidElementAtIndexError(index, numElements) + } else { + QueryExecutionErrors.invalidArrayIndexError(index, numElements) + } + } + + def arithmeticOverflowError( + message: String, + hint: String = "", + errorContext: String = ""): ArithmeticException = { + QueryExecutionErrors.arithmeticOverflowError(message, hint, errorContext) + } + + def cannotChangeDecimalPrecisionError( + value: Decimal, + toType: DecimalType, + context: String = ""): ArithmeticException = { + QueryExecutionErrors.cannotChangeDecimalPrecisionError( + value, toType.precision, toType.scale, context + ) + } + + def overflowInIntegralDivideError(context: String = ""): ArithmeticException = { + QueryExecutionErrors.arithmeticOverflowError( + "Overflow in integral divide", "try_divide", context + ) + } + + def sparkDateTimeException(infOrNan: String): SparkDateTimeException = { + // These are the arguments required by SparkDateTimeException class to create error message. 
+ val errorClass = "CAST_INVALID_INPUT" + val messageParameters = Array("DOUBLE", "TIMESTAMP", SQLConf.ANSI_ENABLED.key) + new SparkDateTimeException(errorClass, Array(infOrNan) ++ messageParameters) + } + + def sqlArrayIndexNotStartAtOneError(): RuntimeException = { + new ArrayIndexOutOfBoundsException("SQL array indices start at 1") + } +} diff --git a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala b/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala index e8a27aaecc8..cedaee9fe69 100644 --- a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala +++ b/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala @@ -21,12 +21,10 @@ spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids -import scala.util.Try - object DatabricksShimServiceProvider { val log = org.slf4j.LoggerFactory.getLogger(getClass().getName().stripSuffix("$")) def matchesVersion(dbrVersion: String): Boolean = { - Try { + try { val sparkBuildInfo = org.apache.spark.BuildInfo val databricksBuildInfo = com.databricks.BuildInfo val matchRes = sparkBuildInfo.dbrVersion.startsWith(dbrVersion) @@ -44,10 +42,10 @@ object DatabricksShimServiceProvider { log.debug(logMessage) } matchRes - }.recover { + } catch { case x: Throwable => log.debug("Databricks detection failed: " + x, x) false - }.getOrElse(false) + } } } diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index 7e58a54c921..1b9bafff947 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -22,7 +22,8 @@ package org.apache.spark.sql.rapids.shims import org.apache.spark.sql.errors.QueryExecutionErrors -object RapidsErrorUtils extends RapidsErrorUtilsBase with RapidsQueryErrorUtils { +object RapidsErrorUtils extends RapidsErrorUtilsBase + with RapidsQueryErrorUtils with SequenceSizeTooLongErrorBuilder { def sqlArrayIndexNotStartAtOneError(): RuntimeException = { QueryExecutionErrors.elementAtByIndexZeroError(context = null) } diff --git a/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala b/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala index aba0f465483..f386973200a 100644 --- a/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala +++ b/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/GetSequenceSize.scala @@ -31,8 +31,6 @@ import org.apache.spark.sql.rapids.{AddOverflowChecks, SubtractOverflowChecks} import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH object GetSequenceSize { - val TOO_LONG_SEQUENCE = "Unsuccessful try to create array with elements exceeding the array " + - s"size limit $MAX_ROUNDED_ARRAY_LENGTH" /** * Compute the size of each sequence according to 'start', 'stop' and 'step'. 
* A row (Row[start, stop, step]) contains at least one null element will produce diff --git a/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala new file mode 100644 index 00000000000..b91c5ed360b --- /dev/null +++ b/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "334"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +object RapidsErrorUtils extends RapidsErrorUtils330To334Base + with SequenceSizeTooLongUnsuccessfulErrorBuilder + diff --git a/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala b/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala new file mode 100644 index 00000000000..5e584de7167 --- /dev/null +++ b/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "334"} +{"spark": "342"} +{"spark": "343"} +{"spark": "351"} +{"spark": "352"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + +trait SequenceSizeTooLongUnsuccessfulErrorBuilder { + def getTooLongSequenceErrorString(sequenceSize: Int, functionName: String): String = { + // The errant function's name does not feature in the exception message + // prior to Spark 4.0. Neither does the attempted allocation size. 
+ "Unsuccessful try to create array with elements exceeding the array " + + s"size limit $MAX_ROUNDED_ARRAY_LENGTH" + } +} diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index 0bf3e66d556..815e8d9dbb0 100644 --- a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -17,80 +17,9 @@ /*** spark-rapids-shim-json-lines {"spark": "340"} {"spark": "341"} -{"spark": "342"} -{"spark": "343"} {"spark": "350"} -{"spark": "351"} -{"spark": "352"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.shims -import org.apache.spark.SparkDateTimeException -import org.apache.spark.sql.catalyst.trees.{Origin, SQLQueryContext} -import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} - -object RapidsErrorUtils extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { - - def mapKeyNotExistError( - key: String, - keyType: DataType, - origin: Origin): NoSuchElementException = { - throw new UnsupportedOperationException( - "`mapKeyNotExistError` has been removed since Spark 3.4.0. " - ) - } - - def invalidArrayIndexError( - index: Int, - numElements: Int, - isElementAtF: Boolean = false, - context: SQLQueryContext = null): ArrayIndexOutOfBoundsException = { - if (isElementAtF) { - QueryExecutionErrors.invalidElementAtIndexError(index, numElements, context) - } else { - QueryExecutionErrors.invalidArrayIndexError(index, numElements, context) - } - } - - def arithmeticOverflowError( - message: String, - hint: String = "", - errorContext: SQLQueryContext = null): ArithmeticException = { - QueryExecutionErrors.arithmeticOverflowError(message, hint, errorContext) - } - - def cannotChangeDecimalPrecisionError( - value: Decimal, - toType: DecimalType, - context: SQLQueryContext = null): ArithmeticException = { - QueryExecutionErrors.cannotChangeDecimalPrecisionError( - value, toType.precision, toType.scale, context - ) - } - - def overflowInIntegralDivideError(context: SQLQueryContext = null): ArithmeticException = { - QueryExecutionErrors.arithmeticOverflowError( - "Overflow in integral divide", "try_divide", context - ) - } - - def sparkDateTimeException(infOrNan: String): SparkDateTimeException = { - // These are the arguments required by SparkDateTimeException class to create error message. 
- val errorClass = "CAST_INVALID_INPUT" - val messageParameters = Map("expression" -> infOrNan, "sourceType" -> "DOUBLE", - "targetType" -> "TIMESTAMP", "ansiConfig" -> SQLConf.ANSI_ENABLED.key) - SparkDateTimeExceptionShims.newSparkDateTimeException(errorClass, messageParameters, - Array.empty, "") - } - - def sqlArrayIndexNotStartAtOneError(): RuntimeException = { - QueryExecutionErrors.invalidIndexOfZeroError(context = null) - } - - override def intervalDivByZeroError(origin: Origin): ArithmeticException = { - QueryExecutionErrors.intervalDividedByZeroError(origin.context) - } -} +object RapidsErrorUtils extends RapidsErrorUtils340PlusBase + with SequenceSizeTooLongErrorBuilder diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils340PlusBase.scala b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils340PlusBase.scala new file mode 100644 index 00000000000..366cbb289c7 --- /dev/null +++ b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils340PlusBase.scala @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "340"} +{"spark": "341"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +{"spark": "352"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import org.apache.spark.SparkDateTimeException +import org.apache.spark.sql.catalyst.trees.{Origin, SQLQueryContext} +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} + +trait RapidsErrorUtils340PlusBase extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { + + def mapKeyNotExistError( + key: String, + keyType: DataType, + origin: Origin): NoSuchElementException = { + throw new UnsupportedOperationException( + "`mapKeyNotExistError` has been removed since Spark 3.4.0. 
" + ) + } + + def invalidArrayIndexError( + index: Int, + numElements: Int, + isElementAtF: Boolean = false, + context: SQLQueryContext = null): ArrayIndexOutOfBoundsException = { + if (isElementAtF) { + QueryExecutionErrors.invalidElementAtIndexError(index, numElements, context) + } else { + QueryExecutionErrors.invalidArrayIndexError(index, numElements, context) + } + } + + def arithmeticOverflowError( + message: String, + hint: String = "", + errorContext: SQLQueryContext = null): ArithmeticException = { + QueryExecutionErrors.arithmeticOverflowError(message, hint, errorContext) + } + + def cannotChangeDecimalPrecisionError( + value: Decimal, + toType: DecimalType, + context: SQLQueryContext = null): ArithmeticException = { + QueryExecutionErrors.cannotChangeDecimalPrecisionError( + value, toType.precision, toType.scale, context + ) + } + + def overflowInIntegralDivideError(context: SQLQueryContext = null): ArithmeticException = { + QueryExecutionErrors.arithmeticOverflowError( + "Overflow in integral divide", "try_divide", context + ) + } + + def sparkDateTimeException(infOrNan: String): SparkDateTimeException = { + // These are the arguments required by SparkDateTimeException class to create error message. + val errorClass = "CAST_INVALID_INPUT" + val messageParameters = Map("expression" -> infOrNan, "sourceType" -> "DOUBLE", + "targetType" -> "TIMESTAMP", "ansiConfig" -> SQLConf.ANSI_ENABLED.key) + SparkDateTimeExceptionShims.newSparkDateTimeException(errorClass, messageParameters, + Array.empty, "") + } + + def sqlArrayIndexNotStartAtOneError(): RuntimeException = { + QueryExecutionErrors.invalidIndexOfZeroError(context = null) + } + + override def intervalDivByZeroError(origin: Origin): ArithmeticException = { + QueryExecutionErrors.intervalDividedByZeroError(origin.context) + } +} diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index a1490eb0959..70e440c07ec 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -22,7 +22,8 @@ package org.apache.spark.sql.rapids.shims import org.apache.spark.sql.errors.QueryExecutionErrors -object RapidsErrorUtils extends RapidsErrorUtilsBase with RapidsQueryErrorUtils { +object RapidsErrorUtils extends RapidsErrorUtilsBase + with RapidsQueryErrorUtils with SequenceSizeTooLongErrorBuilder { def sqlArrayIndexNotStartAtOneError(): RuntimeException = { QueryExecutionErrors.invalidIndexOfZeroError(context = null) } diff --git a/sql-plugin/src/main/spark342/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark342/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala new file mode 100644 index 00000000000..a1c038e1148 --- /dev/null +++ b/sql-plugin/src/main/spark342/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "342"} +{"spark": "343"} +{"spark": "351"} +{"spark": "352"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +object RapidsErrorUtils extends RapidsErrorUtils340PlusBase + with SequenceSizeTooLongUnsuccessfulErrorBuilder diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala new file mode 100644 index 00000000000..51f56f612fd --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +object RapidsErrorUtils extends RapidsErrorUtils340PlusBase + with SequenceSizeExceededLimitErrorBuilder diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala new file mode 100644 index 00000000000..741634aea3f --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import org.apache.spark.sql.errors.QueryExecutionErrors + +trait SequenceSizeExceededLimitErrorBuilder { + def getTooLongSequenceErrorString(sequenceSize: Int, functionName: String): String = { + QueryExecutionErrors.createArrayWithElementsExceedLimitError(functionName, sequenceSize) + .getMessage + } +} diff --git a/tests/pom.xml b/tests/pom.xml index ff4ecd83acb..a8fef6b7930 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.12 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT tests @@ -156,6 +156,19 @@ 3.1.0.0-RC2 test + + org.apache.parquet + parquet-column + ${parquet.hadoop.version} + test + tests + + + org.apache.parquet + parquet-avro + ${parquet.hadoop.version} + test + diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/JsonScanRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/JsonScanRetrySuite.scala index 1db21ca4f58..47546f25513 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/JsonScanRetrySuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/JsonScanRetrySuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.types._ class JsonScanRetrySuite extends RmmSparkRetrySuiteBase { test("test simple retry") { - val bufferer = HostLineBuffererFactory.createBufferer(100, Array('\n'.toByte)) + val bufferer = FilterEmptyHostLineBuffererFactory.createBufferer(100, Array('\n'.toByte)) bufferer.add("{\"a\": 1, \"b\": 2".getBytes, 0, 14) val cudfSchema = GpuColumnVector.from(StructType(Seq(StructField("a", IntegerType), diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/PrioritySemaphoreSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/PrioritySemaphoreSuite.scala index 0ba125f60ab..7199aa55df6 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/PrioritySemaphoreSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/PrioritySemaphoreSuite.scala @@ -26,24 +26,24 @@ class PrioritySemaphoreSuite extends AnyFunSuite { test("tryAcquire should return true if permits are available") { val semaphore = new TestPrioritySemaphore(10) - assert(semaphore.tryAcquire(5, 0)) - assert(semaphore.tryAcquire(3, 0)) - assert(semaphore.tryAcquire(2, 0)) - assert(!semaphore.tryAcquire(1, 0)) + assert(semaphore.tryAcquire(5, 0, 0)) + assert(semaphore.tryAcquire(3, 0, 0)) + assert(semaphore.tryAcquire(2, 0, 0)) + assert(!semaphore.tryAcquire(1, 0, 0)) } test("acquire and release should work correctly") { val semaphore = new TestPrioritySemaphore(1) - assert(semaphore.tryAcquire(1, 0)) + assert(semaphore.tryAcquire(1, 0, 0)) val t = new Thread(() => { try { - semaphore.acquire(1, 1) + semaphore.acquire(1, 1, 0) fail("Should not acquire permit") } catch { case _: InterruptedException => - semaphore.acquire(1, 1) + semaphore.acquire(1, 1, 0) } }) t.start() @@ -62,7 +62,7 @@ class PrioritySemaphoreSuite extends AnyFunSuite { def taskWithPriority(priority: Int) = new Runnable { override def run(): Unit = { - semaphore.acquire(1, priority) + semaphore.acquire(1, priority, 0) results.add(priority) semaphore.release(1) } @@ -84,9 +84,9 @@ class PrioritySemaphoreSuite extends AnyFunSuite { test("low priority thread cannot surpass high priority thread") { val semaphore = new 
TestPrioritySemaphore(10) - semaphore.acquire(5, 0) + semaphore.acquire(5, 0, 0) val t = new Thread(() => { - semaphore.acquire(10, 2) + semaphore.acquire(10, 2, 0) semaphore.release(10) }) t.start() @@ -94,10 +94,36 @@ class PrioritySemaphoreSuite extends AnyFunSuite { // Here, there should be 5 available permits, but a thread with higher priority (2) // is waiting to acquire, therefore we should get rejected here - assert(!semaphore.tryAcquire(5, 0)) + assert(!semaphore.tryAcquire(5, 0, 0)) semaphore.release(5) t.join(1000) // After the high priority thread finishes, we can acquire with lower priority - assert(semaphore.tryAcquire(5, 0)) + assert(semaphore.tryAcquire(5, 0, 0)) + } + + // this case is described at https://github.com/NVIDIA/spark-rapids/pull/11574/files#r1795652488 + test("thread with larger task id should not surpass smaller task id in the waiting queue") { + val semaphore = new TestPrioritySemaphore(10) + semaphore.acquire(8, 0, 0) + val t = new Thread(() => { + semaphore.acquire(5, 0, 0) + semaphore.release(5) + }) + t.start() + Thread.sleep(100) + + // Here, there should be 2 available permits, and a thread with the same task id (0) + // is waiting to acquire 5 permits, so in this case we should succeed + assert(semaphore.tryAcquire(2, 0, 0)) + semaphore.release(2) + + // Here, there should be 2 available permits, but a thread with a smaller task id (0) + // is waiting to acquire, therefore we should get rejected here + assert(!semaphore.tryAcquire(2, 0, 1)) + + semaphore.release(8) + t.join(1000) + // After the thread with the smaller task id finishes, we can acquire with the larger task id + assert(semaphore.tryAcquire(2, 0, 1)) + } } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index a60ea50ef4e..e1c06a88fa1 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -418,7 +418,8 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } - test("replace_replace - ?, *, +, and {0, n} repetitions") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("replace_replace - ?, *, +, and {0, n} repetitions") { val patterns = Seq("D?", "D*", "D+", "D{0,}", "D{0,1}", "D{0,5}", "[1a-zA-Z]{0,}", "[1a-zA-Z]{0,2}", "A+") val inputs = Seq("SS", "DD", "SDSDSDS", "DDDD", "DDDDDD", "ABCDEFG") @@ -710,23 +711,27 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } } - test("AST fuzz test - regexp_find") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("AST fuzz test - regexp_find") { doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND, RegexFindMode) } - test("AST fuzz test - regexp_replace") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("AST fuzz test - regexp_replace") { doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE, RegexReplaceMode) } - test("AST fuzz test - regexp_find - full unicode input") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("AST fuzz test - regexp_find - full unicode input") { assume(isUnicodeEnabled()) doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexFindMode) } - test("AST fuzz test - regexp_replace - full unicode input") { + // Disabling until 
https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("AST fuzz test - regexp_replace - full unicode input") { assume(isUnicodeEnabled()) doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexReplaceMode) @@ -736,7 +741,8 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { Charset.defaultCharset().name() == "UTF-8" } - test("AST fuzz test - regexp_find - anchor focused") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("AST fuzz test - regexp_find - anchor focused") { doAstFuzzTest(validDataChars = Some("\r\nabc"), validPatternChars = "^$\\AZz\r\n()[]-", mode = RegexFindMode) } @@ -778,7 +784,8 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } } - test("regexp_split - repetition with {0,n}, or {0,}") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("regexp_split - repetition with {0,n}, or {0,}") { // see https://github.com/NVIDIA/spark-rapids/issues/6958 val patterns = Set("ba{0,}", raw"a\02{0,}", "ba{0,2}", raw"b\02{0,10}") val data = Seq("abaa", "baba", "ba\u0002b", "ab\u0002b\u0002a") @@ -832,7 +839,8 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } } - test("string split fuzz") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("string split fuzz") { val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode) for (limit <- Seq(-2, -1, 2, 5)) { @@ -840,7 +848,8 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } } - test("string split fuzz - anchor focused") { + // Disabling until https://github.com/NVIDIA/spark-rapids/issues/11600 is fixed + ignore("string split fuzz - anchor focused") { val (data, patterns) = generateDataAndPatterns(validDataChars = Some("\r\nabc"), validPatternChars = "^$\\AZz\r\n()", RegexSplitMode) doStringSplitTest(patterns, data, -1) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/lore/GpuLoreSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/lore/GpuLoreSuite.scala index 7db46718e89..057fbd7ecc3 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/lore/GpuLoreSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/lore/GpuLoreSuite.scala @@ -17,6 +17,7 @@ package com.nvidia.spark.rapids.lore import com.nvidia.spark.rapids.{FunSuiteWithTempDir, GpuColumnarToRowExec, RapidsConf, SparkQueryCompareTestSuite} +import com.nvidia.spark.rapids.Arm.withResource import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging @@ -147,6 +148,26 @@ class GpuLoreSuite extends SparkQueryCompareTestSuite with FunSuiteWithTempDir w } } + test("Non-empty lore dump path") { + withGpuSparkSession{ spark => + spark.conf.set(RapidsConf.LORE_DUMP_PATH.key, TEST_FILES_ROOT.getAbsolutePath) + spark.conf.set(RapidsConf.LORE_DUMP_IDS.key, "3[*]") + + //Create a file in the root path + val path = new Path(s"${TEST_FILES_ROOT.getAbsolutePath}/test") + val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration) + withResource(fs.create(path, true)) { _ => + } + + val df = spark.range(0, 1000, 1, 100) + .selectExpr("id % 10 as key", "id % 100 as value") + + assertThrows[IllegalArgumentException] { + df.collect() + } + } + } + private def doTestReplay(loreDumpIds: String)(dfFunc: SparkSession => DataFrame) = { val loreId = OutputLoreId.parse(loreDumpIds).head._1 withGpuSparkSession { spark => diff --git 
a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetAvroCompatibilitySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetAvroCompatibilitySuite.scala new file mode 100644 index 00000000000..b7075fade2f --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetAvroCompatibilitySuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetAvroCompatibilitySuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetAvroCompatibilitySuite + extends ParquetAvroCompatibilitySuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetColumnIndexSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetColumnIndexSuite.scala new file mode 100644 index 00000000000..5f234df5850 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetColumnIndexSuite.scala @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetColumnIndexSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetColumnIndexSuite extends ParquetColumnIndexSuite with RapidsSQLTestsBaseTrait { +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetCompressionCodecPrecedenceSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetCompressionCodecPrecedenceSuite.scala new file mode 100644 index 00000000000..c5d31ad6664 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetCompressionCodecPrecedenceSuite.scala @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetCompressionCodecPrecedenceSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetCompressionCodecPrecedenceSuite + extends ParquetCompressionCodecPrecedenceSuite + with RapidsSQLTestsBaseTrait { +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaByteArrayEncodingSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaByteArrayEncodingSuite.scala new file mode 100644 index 00000000000..351050cc158 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaByteArrayEncodingSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetDeltaByteArrayEncodingSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetDeltaByteArrayEncodingSuite + extends ParquetDeltaByteArrayEncodingSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaEncodingSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaEncodingSuite.scala new file mode 100644 index 00000000000..82597361230 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaEncodingSuite.scala @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.{ParquetDeltaEncodingInteger, ParquetDeltaEncodingLong} +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetDeltaEncodingInteger + extends ParquetDeltaEncodingInteger + with RapidsSQLTestsBaseTrait {} + +class RapidsParquetDeltaEncodingLong + extends ParquetDeltaEncodingLong + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaLengthByteArrayEncodingSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaLengthByteArrayEncodingSuite.scala new file mode 100644 index 00000000000..95b3b451068 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetDeltaLengthByteArrayEncodingSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines + {"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetDeltaLengthByteArrayEncodingSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetDeltaLengthByteArrayEncodingSuite + extends ParquetDeltaLengthByteArrayEncodingSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdIOSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdIOSuite.scala new file mode 100644 index 00000000000..cf04e5eaa9c --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdIOSuite.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetFieldIdIOSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetFieldIdIOSuite extends ParquetFieldIdIOSuite with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdSchemaSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdSchemaSuite.scala new file mode 100644 index 00000000000..49d9fc1eec6 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFieldIdSchemaSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetFieldIdSchemaSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetFieldIdSchemaSuite + extends ParquetFieldIdSchemaSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFileFormatSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFileFormatSuite.scala new file mode 100644 index 00000000000..24c9390b426 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetFileFormatSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetFileFormatSuite + extends ParquetFileFormatSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetInteroperabilitySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetInteroperabilitySuite.scala new file mode 100644 index 00000000000..eb22c1e3019 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetInteroperabilitySuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetInteroperabilitySuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetInteroperabilitySuite + extends ParquetInteroperabilitySuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetPartitionDiscoverySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetPartitionDiscoverySuite.scala new file mode 100644 index 00000000000..66ed9e785e6 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetPartitionDiscoverySuite.scala @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.{ParquetPartitionDiscoverySuite, ParquetV1PartitionDiscoverySuite, ParquetV2PartitionDiscoverySuite} +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetPartitionDiscoverySuite + extends ParquetPartitionDiscoverySuite + with RapidsSQLTestsBaseTrait {} + +class RapidsParquetV1PartitionDiscoverySuite + extends ParquetV1PartitionDiscoverySuite + with RapidsSQLTestsBaseTrait {} + +class RapidsParquetV2PartitionDiscoverySuite + extends ParquetV2PartitionDiscoverySuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetProtobufCompatibilitySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetProtobufCompatibilitySuite.scala new file mode 100644 index 00000000000..f7833ec1d7d --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetProtobufCompatibilitySuite.scala @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.execution.datasources.parquet.ParquetProtobufCompatibilitySuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetProtobufCompatibilitySuite + extends ParquetProtobufCompatibilitySuite + with RapidsSQLTestsBaseTrait { + override protected def readResourceParquetFile(name: String): DataFrame = { + spark.read.parquet(testFile(name)) + } +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetQuerySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetQuerySuite.scala new file mode 100644 index 00000000000..6dfc44d4bfa --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetQuerySuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetQuerySuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetQuerySuite + extends ParquetQuerySuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetRebaseDatetimeSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetRebaseDatetimeSuite.scala new file mode 100644 index 00000000000..4e7cd659954 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetRebaseDatetimeSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetRebaseDatetimeSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetRebaseDatetimeSuite + extends ParquetRebaseDatetimeSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaPruningSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaPruningSuite.scala new file mode 100644 index 00000000000..689448fb7f0 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaPruningSuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaPruningSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetSchemaPruningSuite + extends ParquetSchemaPruningSuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaSuite.scala new file mode 100644 index 00000000000..e579844ae4b --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetSchemaSuite.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetSchemaSuite extends ParquetSchemaSuite with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetThriftCompatibilitySuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetThriftCompatibilitySuite.scala new file mode 100644 index 00000000000..5353bd139c3 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetThriftCompatibilitySuite.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetThriftCompatibilitySuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetThriftCompatibilitySuite + extends ParquetThriftCompatibilitySuite + with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetVectorizedSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetVectorizedSuite.scala new file mode 100644 index 00000000000..32dc8c15827 --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsParquetVectorizedSuite.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.execution.datasources.parquet.ParquetVectorizedSuite +import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait + +class RapidsParquetVectorizedSuite extends ParquetVectorizedSuite with RapidsSQLTestsBaseTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index db59c67f3dd..8b76e350fef 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -19,7 +19,7 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.utils -import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonExpressionsSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, RapidsStringFunctionsSuite} +import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonExpressionsSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsParquetAvroCompatibilitySuite, RapidsParquetColumnIndexSuite, RapidsParquetCompressionCodecPrecedenceSuite, RapidsParquetDeltaByteArrayEncodingSuite, RapidsParquetDeltaEncodingInteger, RapidsParquetDeltaEncodingLong, RapidsParquetDeltaLengthByteArrayEncodingSuite, RapidsParquetFieldIdIOSuite, RapidsParquetFieldIdSchemaSuite, RapidsParquetFileFormatSuite, RapidsParquetInteroperabilitySuite, RapidsParquetPartitionDiscoverySuite, RapidsParquetProtobufCompatibilitySuite, RapidsParquetQuerySuite, RapidsParquetRebaseDatetimeSuite, RapidsParquetSchemaPruningSuite, RapidsParquetSchemaSuite, RapidsParquetThriftCompatibilitySuite, RapidsParquetVectorizedSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, 
RapidsStringFunctionsSuite} // Some settings' line length exceeds 100 // scalastyle:off line.size.limit @@ -72,6 +72,47 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] .exclude("SPARK-33428 conv function shouldn't raise error if input string is too big", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11142")) .exclude("SPARK-36229 conv should return result equal to -1 in base of toBase", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11142")) + enableSuite[RapidsParquetAvroCompatibilitySuite] + .exclude("SPARK-10136 array of primitive array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11401")) + enableSuite[RapidsParquetColumnIndexSuite] + enableSuite[RapidsParquetCompressionCodecPrecedenceSuite] + .exclude("Create parquet table with compression", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11416")) + enableSuite[RapidsParquetDeltaByteArrayEncodingSuite] + enableSuite[RapidsParquetDeltaEncodingInteger] + enableSuite[RapidsParquetDeltaEncodingLong] + enableSuite[RapidsParquetDeltaLengthByteArrayEncodingSuite] + enableSuite[RapidsParquetFileFormatSuite] + .excludeByPrefix("Propagate Hadoop configs from", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11402")) + enableSuite[RapidsParquetFieldIdIOSuite] + enableSuite[RapidsParquetFieldIdSchemaSuite] + enableSuite[RapidsParquetInteroperabilitySuite] + .exclude("SPARK-36803: parquet files with legacy mode and schema evolution", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11454")) + .exclude("parquet timestamp conversion", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11448")) + enableSuite[RapidsParquetPartitionDiscoverySuite] + .exclude("Various partition value types", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11430")) + enableSuite[RapidsParquetProtobufCompatibilitySuite] + .exclude("struct with unannotated array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11379")) + .exclude("unannotated array of struct with unannotated array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11379")) + enableSuite[RapidsParquetQuerySuite] + .exclude("SPARK-26677: negated null-safe equality comparison should not filter matched row groups", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11403")) + .exclude("SPARK-34212 Parquet should read decimals correctly", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11433")) + enableSuite[RapidsParquetRebaseDatetimeSuite] + .exclude("SPARK-31159, SPARK-37705: compatibility with Spark 2.4/3.2 in reading dates/timestamps", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11404")) + .exclude("SPARK-31159, SPARK-37705: rebasing timestamps in write", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11404")) + .exclude("SPARK-31159: rebasing dates in write", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11404")) + .exclude("SPARK-35427: datetime rebasing in the EXCEPTION mode", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11404")) + enableSuite[RapidsParquetSchemaPruningSuite] + .excludeByPrefix("Spark vectorized reader", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11405")) + .excludeByPrefix("Non-vectorized reader", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11405")) + .excludeByPrefix("Case-insensitive parser", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11405")) + .excludeByPrefix("Case-sensitive parser", 
KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11405")) + enableSuite[RapidsParquetSchemaSuite] + .exclude("schema mismatch failure error message for parquet reader", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11434")) + .exclude("schema mismatch failure error message for parquet vectorized reader", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11446")) + enableSuite[RapidsParquetThriftCompatibilitySuite] + .exclude("Read Parquet file generated by parquet-thrift", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11381")) + .exclude("SPARK-10136 list of primitive list", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11381")) + enableSuite[RapidsParquetVectorizedSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] .exclude("SPARK-22550: Elt should not generate codes beyond 64KB", WONT_FIX_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) diff --git a/tools/pom.xml b/tools/pom.xml index 69512e966e1..13f960505a7 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT com.nvidia diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml index e50984c5470..afe827baf78 100644 --- a/udf-compiler/pom.xml +++ b/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.12 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT udf-compiler