diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4b3c31e6b3b0..cd2f89874e1a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -209,8 +209,6 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions - name: Check datafusion (parquet) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet - - name: Check datafusion (pyarrow) - run: cargo check --profile ci --no-default-features -p datafusion --features=pyarrow - name: Check datafusion (regex_expressions) run: cargo check --profile ci --no-default-features -p datafusion --features=regex_expressions - name: Check datafusion (recursive_protection) @@ -572,30 +570,6 @@ jobs: shell: bash run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests - test-datafusion-pyarrow: - name: cargo test pyarrow (amd64) - needs: linux-build-lib - runs-on: ubuntu-latest - container: - image: amd64/rust:bullseye # Use the bullseye tag image which comes with python3.9 - steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - submodules: true - fetch-depth: 1 - - name: Install PyArrow - run: | - echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - apt-get update - apt-get install python3-pip -y - python3 -m pip install pyarrow - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable - - name: Run datafusion-common tests - run: cargo test --profile ci -p datafusion-common --features=pyarrow,sql - vendor: name: Verify Vendored Code runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index f500265108ff..6c43db0424ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -238,7 +238,6 @@ dependencies = [ "arrow-ipc", "arrow-json", "arrow-ord", - "arrow-pyarrow", "arrow-row", "arrow-schema", "arrow-select", @@ -422,18 +421,6 @@ dependencies = [ "arrow-select", ] 
-[[package]] -name = "arrow-pyarrow" -version = "57.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" -dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", - "pyo3", -] - [[package]] name = "arrow-row" version = "57.0.0" @@ -1970,7 +1957,6 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] @@ -2024,7 +2010,6 @@ dependencies = [ "object_store", "parquet", "paste", - "pyo3", "rand 0.9.2", "recursive", "sqlparser", @@ -3765,12 +3750,6 @@ dependencies = [ "web-time", ] -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - [[package]] name = "insta" version = "1.43.2" @@ -4113,15 +4092,6 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "mimalloc" version = "0.1.48" @@ -4911,67 +4881,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "pyo3" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" -dependencies = [ - "indoc", - "libc", - "memoffset", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" -dependencies = [ - "target-lexicon", -] - 
-[[package]] -name = "pyo3-ffi" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn 2.0.108", -] - [[package]] name = "quad-rand" version = "0.2.3" @@ -6203,12 +6112,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" -[[package]] -name = "target-lexicon" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" - [[package]] name = "tempfile" version = "3.23.0" @@ -6806,12 +6709,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" -[[package]] -name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - [[package]] name = "unit-prefix" version = "0.5.1" diff --git a/README.md b/README.md index 5191496eaafe..5c55b2b15efa 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,6 @@ Optional features: - `avro`: support for reading the [Apache Avro] format - `backtrace`: 
include backtrace information in error messages - `parquet_encryption`: support for using [Parquet Modular Encryption] -- `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature [apache avro]: https://avro.apache.org/ diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh index 6a00ad810956..aa994bc2b8c8 100755 --- a/ci/scripts/rust_clippy.sh +++ b/ci/scripts/rust_clippy.sh @@ -18,4 +18,4 @@ # under the License. set -ex -cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings \ No newline at end of file +cargo clippy --all-targets --workspace --features avro,integration-tests,extended_tests -- -D warnings diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 4eaeed675a20..bf5c0dc9a82f 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -46,7 +46,6 @@ futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } -tokio = { workspace = true } [dev-dependencies] datafusion-datasource-parquet = { workspace = true } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index a9eb0f2220c6..c5f8972ff0c1 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -45,7 +45,6 @@ parquet_encryption = [ "parquet/encryption", "dep:hex", ] -pyarrow = ["pyo3", "arrow/pyarrow", "parquet"] force_hash_collisions = [] recursive_protection = ["dep:recursive"] parquet = ["dep:parquet"] @@ -71,7 +70,6 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git 
a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index c8d5a30ee3e0..c8172820e8ba 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -34,8 +34,6 @@ mod dfschema; mod functional_dependencies; mod join_type; mod param_value; -#[cfg(feature = "pyarrow")] -mod pyarrow; mod schema_reference; mod table_reference; mod unnest; diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs deleted file mode 100644 index 18c6739735ff..000000000000 --- a/datafusion/common/src/pyarrow.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Conversions between PyArrow and DataFusion types - -use arrow::array::{Array, ArrayData}; -use arrow::pyarrow::{FromPyArrow, ToPyArrow}; -use pyo3::exceptions::PyException; -use pyo3::prelude::PyErr; -use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyResult, Python}; - -use crate::{DataFusionError, ScalarValue}; - -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { - PyException::new_err(err.to_string()) - } -} - -impl FromPyArrow for ScalarValue { - fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { - let py = value.py(); - let typ = value.getattr("type")?; - let val = value.call_method0("as_py")?; - - // construct pyarrow array from the python value and pyarrow type - let factory = py.import("pyarrow")?.getattr("array")?; - let args = PyList::new(py, [val])?; - let array = factory.call1((args, typ))?; - - // convert the pyarrow array to rust array using C data interface - let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); - let scalar = ScalarValue::try_from_array(&array, 0)?; - - Ok(scalar) - } -} - -impl ToPyArrow for ScalarValue { - fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1("__getitem__", (0,))?; - - Ok(pyscalar) - } -} - -impl<'source> FromPyObject<'source> for ScalarValue { - fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { - Self::from_pyarrow_bound(value) - } -} - -impl<'source> IntoPyObject<'source> for ScalarValue { - type Target = PyAny; - - type Output = Bound<'source, Self::Target>; - - type Error = PyErr; - - fn into_pyobject(self, py: Python<'source>) -> Result { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - pyarray.call_method1("__getitem__", (0,)) - } -} - 
-#[cfg(test)] -mod tests { - use pyo3::ffi::c_str; - use pyo3::py_run; - use pyo3::types::PyDict; - use pyo3::Python; - - use super::*; - - fn init_python() { - Python::initialize(); - Python::attach(|py| { - if py.run(c_str!("import pyarrow"), None, None).is_err() { - let locals = PyDict::new(py); - py.run( - c_str!( - "import sys; executable = sys.executable; python_path = sys.path" - ), - None, - Some(&locals), - ) - .expect("Couldn't get python info"); - let executable = locals.get_item("executable").unwrap(); - let executable: String = executable.extract().unwrap(); - - let python_path = locals.get_item("python_path").unwrap(); - let python_path: Vec = python_path.extract().unwrap(); - - panic!("pyarrow not found\nExecutable: {executable}\nPython path: {python_path:?}\n\ - HINT: try `pip install pyarrow`\n\ - NOTE: On Mac OS, you must compile against a Framework Python \ - (default in python.org installers and brew, but not pyenv)\n\ - NOTE: On Mac OS, PYO3 might point to incorrect Python library \ - path when using virtual environments. Try \ - `export PYTHONPATH=$(python -c \"import sys; print(sys.path[-1])\")`\n") - } - }) - } - - #[test] - fn test_roundtrip() { - init_python(); - - let example_scalars = [ - ScalarValue::Boolean(Some(true)), - ScalarValue::Int32(Some(23)), - ScalarValue::Float64(Some(12.34)), - ScalarValue::from("Hello!"), - ScalarValue::Date32(Some(1234)), - ]; - - Python::attach(|py| { - for scalar in example_scalars.iter() { - let result = - ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) - .unwrap(); - assert_eq!(scalar, &result); - } - }); - } - - #[test] - fn test_py_scalar() -> PyResult<()> { - init_python(); - - Python::attach(|py| -> PyResult<()> { - let scalar_float = ScalarValue::Float64(Some(12.34)); - let py_float = scalar_float - .into_pyobject(py)? 
- .call_method0("as_py") - .unwrap(); - py_run!(py, py_float, "assert py_float == 12.34"); - - let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string())); - let py_string = scalar_string - .into_pyobject(py)? - .call_method0("as_py") - .unwrap(); - py_run!(py, py_string, "assert py_string == 'Hello!'"); - - Ok(()) - }) - } -} diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index f672e3a94681..8de020aa2fb1 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -76,7 +76,6 @@ parquet_encryption = [ "datafusion-common/parquet_encryption", "datafusion-datasource-parquet/parquet_encryption", ] -pyarrow = ["datafusion-common/pyarrow", "parquet"] regex_expressions = [ "datafusion-functions/regex_expressions", ] diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 0b227000f73d..d8735d655282 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -19,6 +19,17 @@ # Upgrade Guides +## DataFusion `52.0.0` + +**Note:** DataFusion `52.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. + +You can see the current [status of the `52.0.0` release here](https://github.com/apache/datafusion/issues/18566) + +### Removal of `pyarrow` feature + +The `pyarrow` feature flag has been removed. This feature was migrated to +the `datafusion-python` repository in version `44.0.0`. + ## DataFusion `51.0.0` **Note:** DataFusion `51.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version.