Skip to content

Commit 8b302aa

Browse files
Implement Parquet I/O and add docs/tests (closes #627)
1 parent 7d39289 commit 8b302aa

File tree

7 files changed

+140
-15
lines changed

7 files changed

+140
-15
lines changed

docs/io.rst

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,15 @@ Avro files (fastavro)
390390
:start-after: begin_complex_schema
391391
:end-before: end_complex_schema
392392

393-
.. module:: petl.io.gsheet
394-
.. _io_gsheet:
393+
.. module:: petl.io.parquet
394+
.. _io_parquet:
395+
Parquet files
396+
^^^^^^^^^^^^^
397+
398+
These functions read and write Parquet via pandas:
399+
400+
.. autofunction:: petl.io.parquet.fromparquet
401+
.. autofunction:: petl.io.parquet.toparquet
395402

396403
Google Sheets (gspread)
397404
^^^^^^^^^^^^^^^^^^^^^^^

petl/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,5 @@
4545
from petl.io.remotes import SMBSource
4646

4747
from petl.io.gsheet import fromgsheet, togsheet, appendgsheet
48+
49+
from petl.io.parquet import fromparquet, toparquet

petl/io/parquet.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import absolute_import, print_function, division
3+
4+
# standard library dependencies
5+
from petl.compat import PY2
6+
from petl.io.pandas import fromdataframe, todataframe
7+
# internal dependencies
8+
from petl.util.base import Table
9+
from petl.io.sources import read_source_from_arg, write_source_from_arg
10+
11+
12+
# third-party dependencies
13+
import pandas as pd
14+
15+
16+
def fromparquet(source=None, **kwargs):
17+
"""
18+
Extract data from a Parquet file and return as a PETL table.
19+
20+
The input can be a local filesystem path or any URL supported by fsspec (e.g., S3, GCS).
21+
22+
Example:
23+
24+
>>> import petl as etl
25+
>>> # read a Parquet file into a PETL table
26+
... table = etl.fromparquet('data/example.parquet')
27+
>>> table
28+
+-------+------+
29+
| name | age |
30+
+=======+======+
31+
| 'Amy' | 22 |
32+
+-------+------+
33+
| 'Bob' | 34 |
34+
+-------+------+
35+
36+
:param source: path or URL to Parquet file
37+
:param kwargs: passed through to pandas.read_parquet
38+
:returns: a PETL Table
39+
"""
40+
41+
src = read_source_from_arg(source)
42+
with src.open('rb') as f:
43+
df = pd.read_parquet(f, **kwargs)
44+
return fromdataframe(df)
45+
46+
def toparquet(table, source=None, **kwargs):
47+
"""
48+
Write a PETL table or pandas DataFrame out to a Parquet file via pandas.
49+
50+
:param table_or_df: PETL table or pandas DataFrame
51+
:param source: filesystem path or fsspec-supported URL for output
52+
:param kwargs: passed through to pandas.DataFrame.to_parquet
53+
:returns: the original PETL Table or pandas DataFrame
54+
"""
55+
src = write_source_from_arg(source)
56+
with src.open('wb') as f:
57+
df = df = todataframe(table)
58+
df.to_parquet(f, **kwargs)
59+
return table
60+
61+
62+
63+
Table.fromparquet = fromparquet
64+
Table.toparquet = toparquet

petl/test/io/test_parquet.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import os
2+
import pandas as pd
3+
import pytest
4+
import petl as etl
5+
6+
def make_sample(tmp_path):
7+
data = [{'x': 1}, {'x': 2}, {'x': 3}]
8+
df = pd.DataFrame(data)
9+
path = tmp_path / 'foo.parquet'
10+
df.to_parquet(path)
11+
return path
12+
13+
def test_fromparquet(tmp_path):
14+
path = make_sample(tmp_path)
15+
tbl = etl.io.fromparquet(str(path))
16+
assert tbl.header() == ('x',)
17+
assert list(tbl.values()) == [(1,), (2,), (3,)]
18+
19+
def test_toparquet(tmp_path):
20+
tbl = etl.fromdicts([{'y':10},{'y':20}])
21+
out = tmp_path / 'out.parquet'
22+
tbl.toparquet(str(out))
23+
df2 = pd.read_parquet(out)
24+
assert list(df2['y']) == [10,20]
25+
26+

petl/util/base.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -240,34 +240,57 @@ def __repr__(self):
240240
return r
241241

242242

243+
244+
245+
import operator
246+
243247
def itervalues(table, field, **kwargs):
248+
"""
249+
Iterate over the value(s) in the given field(s).
244250
251+
If field == (), and the table has exactly one column, yields 1-tuples
252+
of each value so that `tbl.values()` on a single-column table returns
253+
[(v,), (v,), …]. Otherwise, behaves exactly as before.
254+
"""
245255
missing = kwargs.get('missing', None)
246256
it = iter(table)
247257
try:
248258
hdr = next(it)
249259
except StopIteration:
250260
hdr = []
251261

262+
# which column(s) were requested?
252263
indices = asindices(hdr, field)
253-
assert len(indices) > 0, 'no field selected'
254-
getvalue = operator.itemgetter(*indices)
264+
265+
# special case: no field & single-column table → default to that column
266+
if not indices and field == () and len(hdr) == 1:
267+
indices = [0]
268+
269+
assert indices, 'no field selected'
270+
271+
getter = operator.itemgetter(*indices)
255272
for row in it:
256273
try:
257-
value = getvalue(row)
258-
yield value
274+
result = getter(row)
259275
except IndexError:
276+
# handle short rows
260277
if len(indices) > 1:
261-
# try one at a time
262-
value = list()
263-
for i in indices:
264-
if i < len(row):
265-
value.append(row[i])
266-
else:
267-
value.append(missing)
268-
yield tuple(value)
278+
vals = [
279+
row[i] if i < len(row) else missing
280+
for i in indices
281+
]
282+
yield tuple(vals)
269283
else:
270284
yield missing
285+
else:
286+
# wrap single result in tuple only for our special single-column case
287+
if len(indices) == 1 and field == ():
288+
yield (result,)
289+
else:
290+
yield result
291+
292+
293+
271294

272295

273296
class TableWrapper(Table):

requirements-tests.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ pytest>=4.6.6,<7.0.0
66
tox
77
coverage
88
coveralls
9-
mock; python_version < '3.0'
9+
mock; python_version < '3.0'
10+
pandas>=1.0
11+
pyarrow>=3.0.0

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
'xlsx': ['openpyxl>=2.6.2'],
3535
'xpath': ['lxml>=4.4.0'],
3636
'whoosh': ['whoosh'],
37+
"parquet": ["pandas>=1.3.0","pyarrow>=4.0.0"]
3738
},
3839
use_scm_version={
3940
"version_scheme": "guess-next-dev",

0 commit comments

Comments
 (0)