Skip to content

Commit f43271f

Browse files
committed
Adding options to ScanParquetOptions
1 parent 2d2ec5e commit f43271f

File tree

3 files changed

+52
-32
lines changed

3 files changed

+52
-32
lines changed

polars/io.ts

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { isPath } from "./utils";
55
import { type LazyDataFrame, _LazyDataFrame } from "./lazy/dataframe";
66
import { type Readable, Stream } from "stream";
77
import { concat } from "./functions";
8+
import type { ScanParquetOptions, RowCount } from "./types";
89

910
export interface ReadCsvOptions {
1011
inferSchemaLength: number | null;
@@ -31,7 +32,7 @@ export interface ReadCsvOptions {
3132
skipRows: number;
3233
tryParseDates: boolean;
3334
skipRowsAfterHeader: number;
34-
rowCount: any;
35+
rowCount: RowCount;
3536
raiseIfEmpty: boolean;
3637
truncateRaggedLines: boolean;
3738
missingIsNull: boolean;
@@ -470,23 +471,6 @@ export function readAvro(pathOrBody, options = {}) {
470471
throw new Error("must supply either a path or body");
471472
}
472473

473-
interface RowCount {
474-
name: string;
475-
offset: string;
476-
}
477-
478-
interface ScanParquetOptions {
479-
nRows?: number;
480-
cache?: boolean;
481-
parallel?: "auto" | "columns" | "row_groups" | "none";
482-
rowCount?: RowCount;
483-
rechunk?: boolean;
484-
lowMemory?: boolean;
485-
useStatistics?: boolean;
486-
cloudOptions?: Map<string, string>;
487-
retries?: number;
488-
}
489-
490474
/**
491475
* Lazily read from a local or cloud-hosted parquet file (or files).
492476
@@ -503,6 +487,10 @@ interface ScanParquetOptions {
503487
This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
504488
@param options.useStatistics - Use statistics in the parquet to determine if pages can be skipped from reading.
505489
@param options.hivePartitioning - Infer statistics and schema from hive partitioned URL and use them to prune reads.
490+
@param options.glob - Expand path given via globbing rules.
491+
@param options.hiveSchema - The column names and data types of the columns by which the data is partitioned.
492+
If not provided (default), the schema of the Hive partitions is inferred.
493+
@param options.tryParseHiveDates - Whether to try parsing hive values as date/datetime types.
506494
@param options.rechunk - In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
507495
@param options.lowMemory - Reduce memory pressure at the expense of performance.
508496
@param options.cache - Cache the result after reading.
@@ -518,6 +506,7 @@ interface ScanParquetOptions {
518506
519507
If `storage_options` is not provided, Polars will try to infer the information from environment variables.
520508
@param options.retries - Number of retries if accessing a cloud instance fails.
509+
@param options.includeFilePaths - Include the path of the source file(s) as a column with this name.
521510
*/
522511
export function scanParquet(source: string, options: ScanParquetOptions = {}) {
523512
const defaultOptions = { parallel: "auto" };

polars/types.ts

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,21 @@ export interface ReadParquetOptions {
141141
* Options for {@link scanParquet}
142142
*/
143143
export interface ScanParquetOptions {
144-
columns?: string[] | number[];
145-
numRows?: number;
146-
parallel?: "auto" | "columns" | "row_groups" | "none";
147-
rowCount?: RowCount;
144+
nRows?: number;
145+
rowIndexName?: string;
146+
rowIndexOffset?: number;
148147
cache?: boolean;
148+
parallel?: "auto" | "columns" | "row_groups" | "none";
149+
glob?: boolean;
150+
hivePartitioning?: boolean;
151+
hiveSchema?: unknown;
152+
tryParseHiveDates?: boolean;
149153
rechunk?: boolean;
154+
lowMemory?: boolean;
155+
useStatistics?: boolean;
156+
cloudOptions?: unknown;
157+
retries?: number;
158+
includeFilePaths?: string;
150159
}
151160

152161
/**
@@ -156,7 +165,7 @@ export interface RowCount {
156165
/** name of column */
157166
name: string;
158167
/** offset */
159-
offset: string;
168+
offset: number;
160169
}
161170

162171
/**

src/lazy/dataframe.rs

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -713,22 +713,38 @@ pub fn scan_csv(path: String, options: ScanCsvOptions) -> napi::Result<JsLazyFra
713713
#[napi(object)]
714714
pub struct ScanParquetOptions {
715715
pub n_rows: Option<i64>,
716+
pub row_index_name: Option<String>,
717+
pub row_index_offset: Option<u32>,
716718
pub cache: Option<bool>,
717719
pub parallel: Wrap<ParallelStrategy>,
718-
pub row_count: Option<JsRowCount>,
720+
pub glob: Option<bool>,
721+
pub hive_partitioning: Option<bool>,
722+
pub hive_schema: Option<Wrap<Schema>>,
723+
pub try_parse_hive_dates: Option<bool>,
719724
pub rechunk: Option<bool>,
720725
pub low_memory: Option<bool>,
721726
pub use_statistics: Option<bool>,
722727
pub cloud_options: Option<HashMap<String, String>>,
723728
pub retries: Option<i64>,
729+
pub include_file_paths: Option<String>,
724730
}
725731

726732
#[napi(catch_unwind)]
727733
pub fn scan_parquet(path: String, options: ScanParquetOptions) -> napi::Result<JsLazyFrame> {
728734
let n_rows = options.n_rows.map(|i| i as usize);
729735
let cache = options.cache.unwrap_or(true);
736+
let glob = options.glob.unwrap_or(true);
730737
let parallel = options.parallel;
731-
let row_index: Option<RowIndex> = options.row_count.map(|rc| rc.into());
738+
739+
let row_index: Option<RowIndex> = if let Some(idn) = options.row_index_name {
740+
Some(RowIndex {
741+
name: idn.into(),
742+
offset: options.row_index_offset.unwrap_or(0)
743+
})
744+
} else {
745+
None
746+
};
747+
732748
let rechunk = options.rechunk.unwrap_or(false);
733749
let low_memory = options.low_memory.unwrap_or(false);
734750
let use_statistics = options.use_statistics.unwrap_or(false);
@@ -751,6 +767,16 @@ pub fn scan_parquet(path: String, options: ScanParquetOptions) -> napi::Result<J
751767
});
752768
}
753769

770+
let hive_schema = options.hive_schema.map(|s| Arc::new(s.0));
771+
let hive_options = HiveOptions {
772+
enabled: options.hive_partitioning,
773+
hive_start_idx: 0,
774+
schema: hive_schema,
775+
try_parse_dates: options.try_parse_hive_dates.unwrap_or(true),
776+
};
777+
778+
let include_file_paths = options.include_file_paths;
779+
754780
let args = ScanArgsParquet {
755781
n_rows,
756782
cache,
@@ -760,13 +786,9 @@ pub fn scan_parquet(path: String, options: ScanParquetOptions) -> napi::Result<J
760786
low_memory,
761787
cloud_options,
762788
use_statistics,
763-
// TODO: Support Hive partitioning.
764-
hive_options: HiveOptions {
765-
enabled: Some(false),
766-
..Default::default()
767-
},
768-
glob: true,
769-
include_file_paths: None
789+
hive_options,
790+
glob,
791+
include_file_paths: include_file_paths.map(Arc::from),
770792
};
771793
let lf = LazyFrame::scan_parquet(path, args).map_err(JsPolarsErr::from)?;
772794
Ok(lf.into())

0 commit comments

Comments
 (0)