diff --git a/java/source/c_data.rst b/java/source/c_data.rst new file mode 100644 index 00000000..4716c1a7 --- /dev/null +++ b/java/source/c_data.rst @@ -0,0 +1,29 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _c-data: + +================ +C Data Interface +================ + +The `Arrow C Data Interface <https://arrow.apache.org/docs/format/CDataInterface.html>`_ enables zero-copy sharing of Arrow data between language +runtimes. A Java program can work seamlessly with C++ and Python programs. +The following example demonstrates how it can be done. + +:ref:`Python Java <arrow-python-java>` +-------------------------------------- diff --git a/java/source/index.rst b/java/source/index.rst index 63f94c0d..ece43c38 100644 --- a/java/source/index.rst +++ b/java/source/index.rst @@ -43,6 +43,7 @@ This cookbook is tested with Apache Arrow |version|. data avro jdbc + c_data Indices and tables ================== diff --git a/java/source/python_java.rst b/java/source/python_java.rst new file mode 100644 index 00000000..ba295f01 --- /dev/null +++ b/java/source/python_java.rst @@ -0,0 +1,279 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. 
regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _arrow-python-java: + +======================== +PyArrow Java Integration +======================== + +The PyArrow library offers a powerful API for Python that can be integrated with Java applications. +This document provides a guide on how to enable seamless data exchange between Python and Java components using PyArrow. + +.. contents:: + +Dictionary Data Roundtrip +========================= + +This section demonstrates a data roundtrip where C Data interface is being used to provide +the seamless access to data across language boundaries. + + +Python Component +---------------- + +In the Python-based component, the data roundtrip process is demonstrated through a sequential workflow. + +1. Create data in Python +2. Export data to Java +3. Import updated data from Java +4. Validate the data consistency + +The Python code uses `jpype `_ to start the JVM and make the Java class MapValuesConsumer available to Python. +Data is generated in PyArrow and exported through C Data to Java. + +.. code-block:: python + + import jpype + import jpype.imports + from jpype.types import JClass + import pyarrow as pa + from pyarrow.cffi import ffi as arrow_c + + # Init the JVM and make MapValuesConsumer class available to Python. 
+ jpype.startJVM(classpath=[ "../target/*"]) + java_c_package = jpype.JPackage("org").apache.arrow.c + MapValuesConsumer = JClass('MapValuesConsumer') + CDataDictionaryProvider = JClass('org.apache.arrow.c.CDataDictionaryProvider') + + # Starting from Python and generating data + # Create a Python DictionaryArray + dictionary = pa.dictionary(pa.int64(), pa.utf8()) + array = pa.array(["A", "B", "C", "A", "D"], dictionary) + print("From Python") + print("Dictionary Created:", array) + + # create the CDataDictionaryProvider instance which is + # required to create dictionary array precisely + c_provider = CDataDictionaryProvider() + consumer = MapValuesConsumer(c_provider) + + # Export the Python array through C Data + c_array = arrow_c.new("struct ArrowArray*") + c_array_ptr = int(arrow_c.cast("uintptr_t", c_array)) + array._export_to_c(c_array_ptr) + + # Export the Schema of the Array through C Data + c_schema = arrow_c.new("struct ArrowSchema*") + c_schema_ptr = int(arrow_c.cast("uintptr_t", c_schema)) + array.type._export_to_c(c_schema_ptr) + + # Send Array and its Schema to the Java function + consumer.callToJava(c_array_ptr, c_schema_ptr) + + # Importing updated values from Java to Python + # Export the Python array through C Data + c_array_from_java = arrow_c.new("struct ArrowArray*") + c_array_ptr_from_java = int(arrow_c.cast("uintptr_t", c_array_from_java)) + + # Export the Schema of the Array through C Data + c_schema_from_java = arrow_c.new("struct ArrowSchema*") + c_schema_ptr_from_java = int(arrow_c.cast("uintptr_t", c_schema_from_java)) + java_wrapped_array = java_c_package.ArrowArray.wrap(c_array_ptr_from_java) + java_wrapped_schema = java_c_package.ArrowSchema.wrap(c_schema_ptr_from_java) + java_c_package.Data.exportVector( + consumer.getAllocatorForJavaConsumer(), + consumer.getVector(), + c_provider, + java_wrapped_array, + java_wrapped_schema + ) + + print("From Java back to Python") + array_from_java = pa.Array._import_from_c(c_array_ptr_from_java, 
c_schema_ptr_from_java) + + # In Java and Python, the same memory is being accessed through the C Data interface. + # Since the array from Java and array created in Python should have same data. + + assert array_from_java.equals(array) + print("Array from Java: ", array_from_java) + + # Releasing Java C Data source. + del array_from_java + + consumer.close() + + jpype.shutdownJVM() + + +.. code-block:: shell + + From Python + Dictionary Created: + -- dictionary: + [ + "A", + "B", + "C", + "D" + ] + -- indices: + [ + 0, + 1, + 2, + 0, + 3 + ] + Doing work in Java + From Java back to Python + Array from Java: + -- dictionary: + [ + "A", + "B", + "C", + "D" + ] + -- indices: + [ + 2, + 1, + 2, + 0, + 3 + ] + +Java Component +-------------- + +In the Java-based component of the system, the following operations are executed: + +1. Receives data from the Python component. +2. Updates the data. +3. Exports the updated data back to Python. + +MapValuesConsumer class uses C Data interface to access the data created in Python. + +.. 
testcode:: + + import org.apache.arrow.c.ArrowArray; + import org.apache.arrow.c.ArrowSchema; + import org.apache.arrow.c.Data; + import org.apache.arrow.c.CDataDictionaryProvider; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.FieldVector; + import org.apache.arrow.vector.BigIntVector; + import org.apache.arrow.util.AutoCloseables; + + /** Imports Arrow data through the C Data interface, overwrites its first value, and keeps the result available for export. */ + class MapValuesConsumer implements AutoCloseable { + private final BufferAllocator allocator; + private final CDataDictionaryProvider provider; + private FieldVector vector; + private final BigIntVector intVector; + + /** Stores the provider and allocator, and creates the internal BigIntVector on that allocator. */ + public MapValuesConsumer(CDataDictionaryProvider provider, BufferAllocator allocator) { + this.provider = provider; + this.allocator = allocator; + this.intVector = new BigIntVector("internal_test_vector", allocator); + } + /** Returns the allocator that was passed to the constructor. */ + public BufferAllocator getAllocatorForJavaConsumer() { + return allocator; + } + /** Returns the vector imported by the most recent update or updateFromJava call (null before the first call). */ + public FieldVector getVector() { + return this.vector; + } + /** Wraps the two raw struct addresses, imports them as a vector using the shared provider, then runs doWorkInJava on it. */ + public void update(long c_array_ptr, long c_schema_ptr) { + ArrowArray arrow_array = ArrowArray.wrap(c_array_ptr); + ArrowSchema arrow_schema = ArrowSchema.wrap(c_schema_ptr); + this.vector = Data.importVector(allocator, arrow_array, arrow_schema, this.provider); + this.doWorkInJava(vector); + } + /** Performs the same steps as update and additionally returns the imported vector. */ + public FieldVector updateFromJava(long c_array_ptr, long c_schema_ptr) { + ArrowArray arrow_array = ArrowArray.wrap(c_array_ptr); + ArrowSchema arrow_schema = ArrowSchema.wrap(c_schema_ptr); + this.vector = Data.importVector(allocator, arrow_array, arrow_schema, this.provider); + this.doWorkInJava(vector); + return vector; + } + /** Prints a marker line and sets element 0 to 2; the vector is cast to BigIntVector, so any other vector type fails the cast. */ + private void doWorkInJava(FieldVector vector) { + System.out.println("Doing work in Java"); + BigIntVector bigIntVector = (BigIntVector)vector; + bigIntVector.setSafe(0, 2); + } + /** Fills the internal vector with the three values 1, 7 and 93 and returns it. */ + public BigIntVector getIntVectorForJavaConsumer() { + intVector.allocateNew(3); + intVector.set(0, 1); + intVector.set(1, 7); + intVector.set(2, 93); + 
intVector.setValueCount(3); + return intVector; + } + /** Closes only the internal intVector; the imported vector field is not closed here. */ + @Override + public void close() throws Exception { + AutoCloseables.close(intVector); + } + } + try (BufferAllocator allocator = new RootAllocator()) { /* Java-only roundtrip: export, import through the consumer, export again, re-import and print */ + CDataDictionaryProvider provider = new CDataDictionaryProvider(); + try (final MapValuesConsumer mvc = new MapValuesConsumer(provider, allocator)) { + try ( + ArrowArray arrowArray = ArrowArray.allocateNew(allocator); + ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator) + ) { + Data.exportVector(allocator, mvc.getIntVectorForJavaConsumer(), provider, arrowArray, + arrowSchema); + FieldVector updatedVector = mvc.updateFromJava(arrowArray.memoryAddress(), + arrowSchema.memoryAddress()); + try (ArrowArray usedArray = ArrowArray.allocateNew(allocator); + ArrowSchema usedSchema = ArrowSchema.allocateNew(allocator)) { + Data.exportVector(allocator, updatedVector, provider, usedArray, usedSchema); + try (FieldVector valueVectors = Data.importVector(allocator, usedArray, usedSchema, + provider)) { + System.out.println(valueVectors); + } + } + updatedVector.close(); /* closed explicitly because MapValuesConsumer.close() does not close it */ + } catch (Exception ex) { + ex.printStackTrace(); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + + +.. testoutput:: + + Doing work in Java + [2, 7, 93] + + +By integrating PyArrow in Python and Java components, this example demonstrates that +a system can be created where data is shared and updated across both languages seamlessly.