-
Notifications
You must be signed in to change notification settings - Fork 198
Issue 627 parquet output #683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 5 commits
8b302aa
3ed6af4
aebd345
2331fa7
5bdd2a7
407c5ed
f592993
d6562e9
35cbf59
2a81e48
03cec66
20b7e07
7425a60
e82ca7f
cfad14b
56071d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| # -*- coding: utf-8 -*- | ||
|
||
| from __future__ import absolute_import, print_function, division | ||
|
|
||
| # compatibility helpers (petl-internal, not standard library) | ||
| from petl.compat import PY2 | ||
|
||
| from petl.io.pandas import fromdataframe, todataframe | ||
| # internal dependencies | ||
| from petl.util.base import Table | ||
| from petl.io.sources import read_source_from_arg, write_source_from_arg | ||
|
|
||
|
|
||
| # third-party dependencies | ||
| import pandas as pd | ||
|
||
|
|
||
|
|
||
def fromparquet(source=None, **kwargs):
    """
    Load a Parquet file through pandas and expose it as a PETL table.

    The `source` argument is resolved with ``read_source_from_arg`` and
    opened in binary mode; the open handle is handed to
    ``pandas.read_parquet`` and the resulting frame is wrapped with
    ``fromdataframe``. The input can be a local filesystem path or any URL
    supported by fsspec (e.g., S3, GCS).

    Example:

        >>> import petl as etl
        >>> # read a Parquet file into a PETL table
        ... table = etl.fromparquet('data/example.parquet')
        >>> table
        +-------+------+
        | name  | age  |
        +=======+======+
        | 'Amy' |   22 |
        +-------+------+
        | 'Bob' |   34 |
        +-------+------+

    :param source: path or URL to Parquet file
    :param kwargs: forwarded unchanged to pandas.read_parquet
    :returns: a PETL Table
    """
    reader = read_source_from_arg(source)
    with reader.open('rb') as handle:
        frame = pd.read_parquet(handle, **kwargs)
    return fromdataframe(frame)
|
|
||
def toparquet(table, source=None, **kwargs):
    """
    Write a PETL table out to a Parquet file via pandas.

    The table is converted with ``todataframe`` *before* the destination is
    opened for writing, so a failure during conversion does not leave an
    empty or truncated output file behind.

    NOTE(review): the previous docstring also advertised a pandas DataFrame
    as input, but the argument is passed straight to ``todataframe`` with no
    DataFrame-specific handling here -- confirm before relying on it.

    :param table: PETL table to write
    :param source: filesystem path or fsspec-supported URL for output
    :param kwargs: passed through to pandas.DataFrame.to_parquet
    :returns: the `table` argument, unchanged
    """
    dest = write_source_from_arg(source)
    # Materialise the frame first; opening with 'wb' truncates the target.
    df = todataframe(table)
    with dest.open('wb') as f:
        df.to_parquet(f, **kwargs)
    return table
|
|
||
|
|
||
|
|
||
# Expose both functions as attributes of Table so they can be called on a table.
# NOTE(review): fromparquet's first parameter is `source`, so calling it on a
# Table instance would pass the table itself as the source -- confirm intended.
| Table.fromparquet = fromparquet | ||
| Table.toparquet = toparquet | ||
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| import pandas as pd | ||
|
||
| import petl as etl | ||
|
|
||
|
|
||
def make_sample(tmp_path):
    """Write a three-row Parquet file with one column 'x' and return its path.

    :param tmp_path: directory (pathlib-style) in which 'foo.parquet' is created
    :returns: path of the written file
    """
    target = tmp_path / 'foo.parquet'
    records = [{'x': 1}, {'x': 2}, {'x': 3}]
    pd.DataFrame(records).to_parquet(target)
    return target
|
|
||
|
|
||
def test_fromparquet(tmp_path):
    """fromparquet exposes the sample file's header and column values."""
    tbl = etl.io.fromparquet(str(make_sample(tmp_path)))

    assert tbl.header() == ('x',)

    # values() needs a field selection; for a single field it yields the
    # bare values rather than 1-tuples.
    assert list(tbl.values('x')) == [1, 2, 3]
|
||
|
|
||
|
|
||
def test_toparquet(tmp_path):
    """A table written with toparquet can be read back by pandas."""
    target = tmp_path / 'out.parquet'

    etl.fromdicts([{'y': 10}, {'y': 20}]).toparquet(str(target))

    written = pd.read_parquet(target)
    assert list(written['y']) == [10, 20]
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,3 +8,7 @@ rinohtype | |
|
|
||
| setuptools | ||
| setuptools-scm | ||
|
|
||
| # add parquet dependencies | ||
| pandas | ||
| pyarrow | ||
Check warning
Code scanning / Ruff (reported by Codacy)
`petl.io.parquet.fromparquet` imported but unused; consider removing, adding to `__all__`, or using a redundant alias (F401)