Skip to content

feat(r): Add GDAL read to R package #485

@paleolimbot

Description

@paleolimbot

In Python we can read GDAL/OGR sources with read_pyogrio():

class PyogrioFormatSpec(ExternalFormatSpec):
    """An `ExternalFormatSpec` implementation wrapping GDAL/OGR via pyogrio"""

    def __init__(self, extension=""):
        """Create a spec for files with the given ``extension``.

        The spec starts with an empty option dictionary.
        """
        self._extension = extension
        self._options = {}

    def with_options(self, options):
        """Return a new spec of the same type with ``options`` applied.

        The receiver is left unmodified. Options already set on the receiver
        are carried over to the clone so that chained calls accumulate; keys
        in ``options`` take precedence over existing ones.
        """
        cloned = type(self)(self.extension)
        # Carry over previously applied options before layering the new ones,
        # otherwise a second with_options() call would silently drop the first.
        cloned._options.update(self._options)
        cloned._options.update(options)
        return cloned

    @property
    def extension(self) -> str:
        """The extension this spec was constructed with."""
        return self._extension

    def open_reader(self, args):
        """Open ``args.src`` with pyogrio and wrap the result in a reader shelter.

        The source URL is translated to a path OGR can open, an optional
        column projection and bounding box are derived from ``args``, and the
        object returned by ``pyogrio.raw.ogr_open_arrow()`` is passed to
        ``PyogrioReaderShelter`` together with the projected column names.

        Raises:
            ValueError: If ``args.src`` cannot be converted to a URL or the
                URL scheme is not one of http, https, or file.
        """
        # Imported lazily so pyogrio is only required when a reader is opened.
        import sys

        import pyogrio.raw

        url = args.src.to_url()
        if url is None:
            raise ValueError(f"Can't convert {args.src} to OGR-openable object")

        if url.startswith(("http://", "https://")):
            ogr_src = f"/vsicurl/{url}"
        elif url.startswith("file://") and sys.platform != "win32":
            # Keep the leading slash of the absolute path (file:///a/b -> /a/b)
            ogr_src = url.removeprefix("file://")
        elif url.startswith("file:///"):
            # Only reached on win32: drop all three slashes
            # (file:///C:/a -> C:/a)
            ogr_src = url.removeprefix("file:///")
        else:
            raise ValueError(f"Can't open {url} with OGR")

        # Zip archives are read through the /vsizip/ prefix
        if ogr_src.endswith(".zip"):
            ogr_src = f"/vsizip/{ogr_src}"

        # Translate projected column indices into column names
        if args.is_projected():
            file_columns = args.file_schema.names
            columns = [file_columns[i] for i in args.file_projection]
        else:
            columns = None

        # An unspecified batch size is passed on as 0
        batch_size = args.batch_size if args.batch_size is not None else 0

        # A bounding box is only derived when there is a filter, a known file
        # schema, and exactly one geometry column to compute it against.
        bbox = None
        if args.filter and args.file_schema is not None:
            geometry_column_indices = args.file_schema.geometry_column_indices
            file_columns = args.file_schema.names
            if len(geometry_column_indices) == 1:
                bbox = args.filter.bounding_box(
                    file_columns[geometry_column_indices[0]]
                )

        # NOTE(review): self._options is not forwarded here; the second
        # positional argument is always an empty dict. Confirm whether the
        # options set via with_options() are meant to be passed to pyogrio.
        return PyogrioReaderShelter(
            pyogrio.raw.ogr_open_arrow(
                ogr_src, {}, columns=columns, batch_size=batch_size, bbox=bbox
            ),
            columns,
        )

This works by leveraging the ArrowArrayStream API as exposed by pyogrio so we don't need to ship a separate GDAL build.

In R it's a significant performance hit to have this read materialize to sf first ( apache/sedona#2576 ). We may need to add something to the sf package to allow the read to skip materialization; however, the sf package does implement the array stream interface at least a little: https://github.com/r-spatial/sf/blob/25700c2cb48191598bfc7495baafe4b6808398c6/R/read.R#L221-L265 , possibly enough to prototype.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions