forked from apecloud/myduckserver
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add compatibility test for python data tools, fix some compatib…
…ility bugs (apecloud#209)
- Loading branch information
1 parent
c2372a8
commit 6a052cb
Showing
10 changed files
with
378 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import io

import pandas as pd
import pyarrow as pa
import polars as pl
import psycopg

# Round-trip compatibility test: build a pandas DataFrame, push it to the
# server as an Arrow IPC stream via COPY, pull it back out the same way,
# and verify the result through Polars.

# Reference dataset.
source = {
    'id': [1, 2, 3],
    'num': [100, 200, 300],
    'data': ['aaa', 'bbb', 'ccc']
}
pandas_df = pd.DataFrame(source)

# Same data as an Arrow table, ready for IPC serialization.
arrow_table = pa.Table.from_pandas(pandas_df)

with psycopg.connect("dbname=postgres user=postgres host=127.0.0.1 port=5432", autocommit=True) as conn:
    with conn.cursor() as cur:
        # Start from a clean schema on every run.
        cur.execute("DROP SCHEMA IF EXISTS test CASCADE")
        cur.execute("CREATE SCHEMA test")

        # Target table mirroring the DataFrame's columns.
        cur.execute("""
            CREATE TABLE test.tb1 (
                id integer PRIMARY KEY,
                num integer,
                data text)
            """)

        # Serialize the Arrow table into an IPC stream and upload it
        # through COPY ... FROM STDIN (FORMAT arrow).
        ipc_buffer = io.BytesIO()
        with pa.ipc.new_stream(ipc_buffer, arrow_table.schema) as writer:
            writer.write_table(arrow_table)
        with cur.copy("COPY test.tb1 FROM STDIN (FORMAT arrow)") as copy:
            copy.write(ipc_buffer.getvalue())

        # Download the table again as a raw Arrow IPC byte stream.
        with cur.copy("COPY test.tb1 TO STDOUT (FORMAT arrow)") as copy:
            downloaded = b"".join(copy)

        # Decode the stream and convert to a Polars DataFrame.
        with pa.ipc.open_stream(downloaded) as reader:
            polars_df = pl.from_arrow(reader.read_all())

        # The round-tripped data must match the original exactly.
        expected = pl.from_pandas(pandas_df)
        assert polars_df.equals(expected), "DataFrames are not equal"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from psycopg import sql
import psycopg

# Compatibility test for the core psycopg workflow against the server:
# parameterized INSERT, fetch, and both directions of COPY.

# Fixture rows loaded into test.tb1 and read back.
rows = [
    (1, 100, "aaa"),
    (2, 200, "bbb"),
    (3, 300, "ccc"),
    (4, 400, "ddd"),
    (5, 500, "eee"),
]

# Connect to an existing database.
with psycopg.connect("dbname=postgres user=postgres host=127.0.0.1 port=5432", autocommit=True) as conn:
    # Open a cursor to perform database operations.
    with conn.cursor() as cur:
        # Recreate the test schema from scratch.
        cur.execute("DROP SCHEMA IF EXISTS test CASCADE")
        cur.execute("CREATE SCHEMA test")

        cur.execute("""
            CREATE TABLE test.tb1 (
                id integer PRIMARY KEY,
                num integer,
                data text)
            """)

        # Parameterized INSERT; psycopg performs the value adaptation.
        cur.execute(
            "INSERT INTO test.tb1 (id, num, data) VALUES (%s, %s, %s)",
            rows[0])

        # Read the single row back as Python objects.
        cur.execute("SELECT * FROM test.tb1")
        fetched = cur.fetchone()
        assert fetched == rows[0], "Row is not equal"

        # Bulk-load the remaining rows via COPY, deliberately exercising
        # both the raw text protocol (write) and the row API (write_row).
        print("Copy data from a file-like object to a table")
        with cur.copy("COPY test.tb1 (id, num, data) FROM STDIN") as copy:
            for record in rows[1:3]:
                line = "\t".join(str(field) for field in record) + "\n"
                copy.write(line.encode())
            for record in rows[3:]:
                copy.write_row(record)

        # Stream the table back out and check each decoded row.
        print("Copy data from a table to a file-like object")
        with cur.copy(
                "COPY (SELECT * FROM test.tb1 LIMIT %s) TO STDOUT",
                (4,)
        ) as copy:
            copy.set_types(["int4", "int4", "text"])
            for index, record in enumerate(copy.rows()):
                assert record == rows[index], f"Row {index} is not equal"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import io

import pandas as pd
import pyarrow as pa
import psycopg

# Round-trip compatibility test: pandas DataFrame -> Arrow IPC stream ->
# COPY into the server, then COPY back out into pandas and compare.

# Reference dataset.
records = {
    'id': [1, 2, 3],
    'num': [100, 200, 300],
    'data': ['aaa', 'bbb', 'ccc']
}
df = pd.DataFrame(records)

# Same data as an Arrow table, ready for IPC serialization.
arrow_table = pa.Table.from_pandas(df)

with psycopg.connect("dbname=postgres user=postgres host=127.0.0.1 port=5432", autocommit=True) as conn:
    with conn.cursor() as cur:
        # Start from a clean schema on every run.
        cur.execute("DROP SCHEMA IF EXISTS test CASCADE")
        cur.execute("CREATE SCHEMA test")

        # Target table mirroring the DataFrame's columns.
        cur.execute("""
            CREATE TABLE test.tb1 (
                id integer PRIMARY KEY,
                num integer,
                data text)
            """)

        # Serialize the Arrow table into an IPC stream and upload it
        # through COPY ... FROM STDIN (FORMAT arrow).
        sink = io.BytesIO()
        with pa.ipc.new_stream(sink, arrow_table.schema) as writer:
            writer.write_table(arrow_table)
        with cur.copy("COPY test.tb1 FROM STDIN (FORMAT arrow)") as copy:
            copy.write(sink.getvalue())

        # Download the table again as a raw Arrow IPC byte stream.
        with cur.copy("COPY test.tb1 TO STDOUT (FORMAT arrow)") as copy:
            payload = b"".join(copy)

        # Decode the stream straight into a pandas DataFrame.
        with pa.ipc.open_stream(payload) as reader:
            df_from_pg = reader.read_pandas()

        # Normalize integer dtypes on both sides before comparing — the
        # server-side columns may come back with a narrower integer width.
        df = df.astype({'id': 'int64', 'num': 'int64'})
        df_from_pg = df_from_pg.astype({'id': 'int64', 'num': 'int64'})

        # The round-tripped data must match the original exactly.
        assert df.equals(df_from_pg), "DataFrames are not equal"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env bats

# Bats harness for the Python data-tool compatibility tests.

# Runs before every test: drop the shared test schema and (re)create the
# pid-tracking file used by teardown.
setup() {
    psql -h 127.0.0.1 -p 5432 -U postgres -c "DROP SCHEMA IF EXISTS test CASCADE;"
    touch /tmp/test_pids
}

# Optional per-test cleanup command, installed via set_custom_teardown.
custom_teardown=""

set_custom_teardown() {
    custom_teardown="$1"
}

# Runs after every test: evaluate the custom teardown (if any), then kill
# and reap any still-running processes recorded in /tmp/test_pids.
teardown() {
    if [ -n "$custom_teardown" ]; then
        eval "$custom_teardown"
        custom_teardown=""
    fi

    while read -r pid; do
        # Only signal pids that still exist.
        if kill -0 "$pid" 2>/dev/null; then
            kill "$pid"
            wait "$pid" 2>/dev/null
        fi
    done < /tmp/test_pids
    rm /tmp/test_pids
}

# Run a command under a 2-minute timeout; on non-zero exit, print the
# captured output for diagnosis and fail the test.
# NOTE(review): bats `run` executes the command synchronously, so `$!` on
# the next line is not that command's pid (it is empty or a stale
# background pid) — the pid-tracking append looks like a no-op; confirm.
start_process() {
    run timeout 2m "$@"
    echo $! >> /tmp/test_pids
    if [ "$status" -ne 0 ]; then
        echo "$output"
        echo "$stderr"
    fi
    [ "$status" -eq 0 ]
}

@test "pg-psycopg" {
    start_process python3 $BATS_TEST_DIRNAME/psycopg_test.py
}

@test "pg-pyarrow" {
    start_process python3 $BATS_TEST_DIRNAME/pyarrow_test.py
}

@test "pg-polars" {
    start_process python3 $BATS_TEST_DIRNAME/polars_test.py
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Tutorial: Accessing MyDuck Server with PostgreSQL using psycopg, pyarrow, and polars | ||
|
||
## 0. Connecting to MyDuck Server using psycopg | ||
|
||
`psycopg` is a popular PostgreSQL adapter for Python. Here is how you can connect to MyDuck Server using `psycopg`: | ||
|
||
```python | ||
import psycopg | ||
with psycopg.connect("dbname=postgres user=postgres host=127.0.0.1 port=5432", autocommit=True) as conn: | ||
with conn.cursor() as cur: | ||
... | ||
``` | ||
|
||
## 1. Using COPY Operation for Direct Interaction | ||
|
||
The `COPY` command in PostgreSQL is a powerful tool for bulk data transfer. Here is how you can use it with the `psycopg` library to interact directly with MyDuck Server: | ||
|
||
### Writing Data Directly | ||
|
||
```python | ||
with cur.copy("COPY test.tb1 (id, num, data) FROM STDIN") as copy: | ||
copy.write(b"1\t100\taaa\n") | ||
``` | ||
|
||
### Writing Data Row by Row | ||
|
||
```python | ||
with cur.copy("COPY test.tb1 (id, num, data) FROM STDIN") as copy: | ||
copy.write_row((1, 100, "aaa")) | ||
``` | ||
|
||
### Reading Data Directly | ||
|
||
```python | ||
with cur.copy("COPY test.tb1 TO STDOUT") as copy: | ||
for block in copy: | ||
print(block) | ||
``` | ||
|
||
### Reading Data Row by Row | ||
|
||
```python | ||
with cur.copy("COPY test.tb1 TO STDOUT") as copy: | ||
for row in copy.rows(): | ||
print(row) | ||
``` | ||
|
||
## 2. Importing and Exporting Data in pyarrow Format | ||
|
||
`pyarrow` allows efficient data interchange between pandas DataFrames and MyDuck Server. Here is how to import and export data in `pyarrow` format: | ||
|
||
### Creating a pandas DataFrame and Converting to Arrow Table | ||
|
||
```python | ||
data = { | ||
'id': [1, 2, 3], | ||
'num': [100, 200, 300], | ||
'data': ['aaa', 'bbb', 'ccc'] | ||
} | ||
df = pd.DataFrame(data) | ||
table = pa.Table.from_pandas(df) | ||
``` | ||
|
||
### Writing Data to MyDuck Server in Arrow Format | ||
|
||
```python | ||
output_stream = io.BytesIO() | ||
with pa.ipc.RecordBatchStreamWriter(output_stream, table.schema) as writer: | ||
writer.write_table(table) | ||
with cur.copy("COPY test.tb1 FROM STDIN (FORMAT arrow)") as copy: | ||
copy.write(output_stream.getvalue()) | ||
``` | ||
|
||
### Reading Data from MyDuck Server in Arrow Format | ||
|
||
```python | ||
arrow_data = io.BytesIO() | ||
with cur.copy("COPY test.tb1 TO STDOUT (FORMAT arrow)") as copy: | ||
for block in copy: | ||
arrow_data.write(block) | ||
print(arrow_data.getvalue()) | ||
``` | ||
|
||
### Converting Arrow Data to an Arrow Table
|
||
```python | ||
with pa.ipc.open_stream(arrow_data.getvalue()) as reader: | ||
arrow_df = reader.read_all() | ||
print(arrow_df) | ||
``` | ||
|
||
### Converting Arrow Data to Pandas DataFrame | ||
|
||
```python | ||
with pa.ipc.open_stream(arrow_data.getvalue()) as reader: | ||
pandas_df = reader.read_pandas() | ||
print(pandas_df) | ||
``` | ||
|
||
## 3. Using polars to Convert pyarrow Format Data | ||
|
||
`polars` is a fast DataFrame library that can work with `pyarrow` data. Here is how to use `polars` to convert `pyarrow` format data: | ||
|
||
### Converting Pandas DataFrame to polars DataFrame | ||
|
||
```python | ||
polars_df = pl.from_pandas(pandas_df) | ||
``` | ||
|
||
### Converting an Arrow Table to a polars DataFrame
|
||
```python | ||
polars_df = pl.from_arrow(arrow_df) | ||
``` |
Oops, something went wrong.