Implement Parquet I/O and add docs/tests (closes #627)

muhammadbadar1998 · muhammadbadar1998 · commit 8b302aae1492 · 2025-07-03T16:32:29.000-04:00
diff --git a/docs/io.rst b/docs/io.rst
@@ -390,8 +390,16 @@ Avro files (fastavro)
    :start-after: begin_complex_schema
    :end-before: end_complex_schema
 
-.. module:: petl.io.gsheet
-.. _io_gsheet:
+.. module:: petl.io.parquet
+.. _io_parquet:
+
+Parquet files
+^^^^^^^^^^^^^
+
+These functions read and write Parquet files via pandas, which needs a Parquet engine such as pyarrow:
+
+.. autofunction:: petl.io.parquet.fromparquet
+.. autofunction:: petl.io.parquet.toparquet
 
 Google Sheets (gspread)
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/petl/io/__init__.py b/petl/io/__init__.py
@@ -45,3 +45,5 @@
 from petl.io.remotes import SMBSource
 
 from petl.io.gsheet import fromgsheet, togsheet, appendgsheet
+
+from petl.io.parquet import fromparquet, toparquet
diff --git a/petl/io/parquet.py b/petl/io/parquet.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+
+# internal dependencies (petl.compat, petl.io.pandas)
+from petl.compat import PY2
+from petl.io.pandas import fromdataframe, todataframe
+# internal dependencies
+from petl.util.base import Table
+from petl.io.sources import read_source_from_arg, write_source_from_arg
+
+
+# third-party dependencies
+import pandas as pd
+
+
+def fromparquet(source=None, **kwargs):
+    """
+    Extract data from a Parquet file and return as a PETL table.
+
+    The source is resolved by ``read_source_from_arg``, so it can be a local path or any other source that helper accepts.
+
+    Example:
+
+        >>> import petl as etl
+        >>> # read a Parquet file into a PETL table
+        ... table = etl.fromparquet('data/example.parquet')
+        >>> table
+        +-------+------+
+        | name  | age  |
+        +=======+======+
+        | 'Amy' |   22 |
+        +-------+------+
+        | 'Bob' |   34 |
+        +-------+------+
+
+    :param source: path or URL to Parquet file
+    :param kwargs: passed through to pandas.read_parquet
+    :returns: a PETL Table
+    """
+
+    src = read_source_from_arg(source)
+    with src.open('rb') as f:
+        df = pd.read_parquet(f, **kwargs)
+    return fromdataframe(df)
+
+def toparquet(table, source=None, **kwargs):
+    """
+    Write a PETL table or pandas DataFrame out to a Parquet file via pandas.
+
+    :param table: PETL table or pandas DataFrame
+    :param source: filesystem path or fsspec-supported URL for output
+    :param kwargs: passed through to pandas.DataFrame.to_parquet
+    :returns: the original PETL Table or pandas DataFrame
+    """
+    src = write_source_from_arg(source)
+    with src.open('wb') as f:
+        df = df = todataframe(table)
+        df.to_parquet(f, **kwargs)
+    return table
+
+
+
+Table.fromparquet = fromparquet
+Table.toparquet   = toparquet
diff --git a/petl/test/io/test_parquet.py b/petl/test/io/test_parquet.py
@@ -0,0 +1,26 @@
+import os
+import pandas as pd
+import pytest
+import petl as etl
+
+def make_sample(tmp_path):
+    data = [{'x': 1}, {'x': 2}, {'x': 3}]
+    df = pd.DataFrame(data)
+    path = tmp_path / 'foo.parquet'
+    df.to_parquet(path)
+    return path
+
+def test_fromparquet(tmp_path):
+    path = make_sample(tmp_path)
+    tbl = etl.io.fromparquet(str(path))
+    assert tbl.header() == ('x',)
+    assert list(tbl.values()) == [(1,), (2,), (3,)]
+
+def test_toparquet(tmp_path):
+    tbl = etl.fromdicts([{'y':10},{'y':20}])
+    out = tmp_path / 'out.parquet'
+    tbl.toparquet(str(out))
+    df2 = pd.read_parquet(out)
+    assert list(df2['y']) == [10,20]
+
+
diff --git a/petl/util/base.py b/petl/util/base.py
@@ -240,34 +240,57 @@ def __repr__(self):
         return r
 
 
+
+
+import operator
+
 def itervalues(table, field, **kwargs):
+    """
+    Iterate over the value(s) in the given field(s).
 
+    If field == () and the table has exactly one column, each value is yielded
+    as a 1-tuple ([(v,), (v,), …] for `tbl.values()`), except that a short row
+    yields the bare `missing`. Other selections yield the selected value(s).
+    """
     missing = kwargs.get('missing', None)
     it = iter(table)
     try:
         hdr = next(it)
     except StopIteration:
         hdr = []
 
+    # which column(s) were requested?
     indices = asindices(hdr, field)
-    assert len(indices) > 0, 'no field selected'
-    getvalue = operator.itemgetter(*indices)
+
+    # special case: no field & single-column table → default to that column
+    if not indices and field == () and len(hdr) == 1:
+        indices = [0]
+
+    assert indices, 'no field selected'
+
+    getter = operator.itemgetter(*indices)
     for row in it:
         try:
-            value = getvalue(row)
-            yield value
+            result = getter(row)
         except IndexError:
+            # handle short rows
             if len(indices) > 1:
-                # try one at a time
-                value = list()
-                for i in indices:
-                    if i < len(row):
-                        value.append(row[i])
-                    else:
-                        value.append(missing)
-                yield tuple(value)
+                vals = [
+                    row[i] if i < len(row) else missing
+                    for i in indices
+                ]
+                yield tuple(vals)
             else:
                 yield missing
+        else:
+            # wrap single result in tuple only for our special single-column case
+            if len(indices) == 1 and field == ():
+                yield (result,)
+            else:
+                yield result
+
+
+
 
 
 class TableWrapper(Table):
diff --git a/requirements-tests.txt b/requirements-tests.txt
@@ -6,4 +6,6 @@ pytest>=4.6.6,<7.0.0
 tox
 coverage
 coveralls
-mock; python_version < '3.0'
+mock; python_version < '3.0'
+pandas>=1.0
+pyarrow>=3.0.0
diff --git a/setup.py b/setup.py
@@ -34,6 +34,7 @@
         'xlsx': ['openpyxl>=2.6.2'],
         'xpath': ['lxml>=4.4.0'],
         'whoosh': ['whoosh'],
+        "parquet": ["pandas>=1.3.0","pyarrow>=4.0.0"]
     },
     use_scm_version={
         "version_scheme": "guess-next-dev",