Skip to content

Commit

Permalink
Merge pull request #2315 from jqnatividad/polars_auto_decompress
Browse files Browse the repository at this point in the history
`sqlp`: auto-decompression of gz, zstd & zlib compressed csv files with `read_csv` table function
  • Loading branch information
jqnatividad authored Dec 1, 2024
2 parents f7591be + 4aa9545 commit 0bf3734
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 2 deletions.
26 changes: 24 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ polars = { version = "0.44.2", features = [
# "cloud",
"coalesce",
"cross_join",
"decompress-fast",
# "dtype-full",
"extract_jsonpath",
"ipc",
Expand Down
Binary file added resources/test/boston311-100.csv.gz
Binary file not shown.
Binary file added resources/test/boston311-100.csv.zlib
Binary file not shown.
Binary file added resources/test/boston311-100.csv.zst
Binary file not shown.
6 changes: 6 additions & 0 deletions src/cmd/sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ Example queries:
# configurability (i.e. limited to comma delimiter, no CSV comments, etc.).
qsv sqlp SKIP_INPUT "select * from read_csv('data.csv') order by col1 desc limit 100"
# note that you can also use read_csv() to read compressed files directly
# gzip, zstd and zlib automatic decompression are supported
qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.gz')"
qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.zst')"
qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.zlib')"
Note that sqlp will automatically use this "fast path" read_csv() optimization when there
is only one input CSV file, no CSV parsing options are used, its not a SQL script and the
`--no-optimizations` flag is not set.
Expand Down
107 changes: 107 additions & 0 deletions tests/test_sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1145,6 +1145,32 @@ fn sqlp_boston311_cte() {
assert_eq!(got, expected);
}

#[test]
fn sqlp_boston311_cte_gz() {
let wrk = Workdir::new("sqlp_boston311_cte_gz");
let test_file = wrk.load_test_file("boston311-100.csv.gz");

let mut cmd = wrk.command("sqlp");
cmd.arg(&test_file).arg(
r#"with boston311_roxbury as (select * from read_csv('boston311-100.csv.gz') where neighborhood = 'Roxbury')
select ward,count(*) as cnt from boston311_roxbury group by ward order by cnt desc, ward asc;"#,
);

wrk.assert_success(&mut cmd);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["ward", "cnt"],
svec!["Ward 11", "2"],
svec!["Ward 13", "2"],
svec!["Ward 8", "2"],
svec!["14", "1"],
svec!["Ward 12", "1"],
];

assert_eq!(got, expected);
}

#[test]
fn sqlp_boston311_case_expression() {
let wrk = Workdir::new("sqlp_boston311_case_expression");
Expand Down Expand Up @@ -1185,6 +1211,46 @@ fn sqlp_boston311_case_expression() {
assert_eq!(got, expected);
}

#[test]
fn sqlp_boston311_case_expression_zlib() {
let wrk = Workdir::new("sqlp_boston311_case_expression_zlib");
let test_file = wrk.load_test_file("boston311-100.csv.zlib");

let mut cmd = wrk.command("sqlp");
cmd.arg(&test_file).arg(
r#"SELECT case_enquiry_id,
CASE closed_dt is null and case_title ~* 'graffiti'
WHEN True THEN 'Yes'
WHEN False THEN 'No'
ELSE 'N/A'
END as graffiti_related
from read_csv('boston311-100.csv.zlib')
where case_status = 'Open'"#,
);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["case_enquiry_id", "graffiti_related"],
svec!["101004143000", "No"],
svec!["101004155594", "No"],
svec!["101004154423", "No"],
svec!["101004141848", "No"],
svec!["101004113313", "No"],
svec!["101004113751", "Yes"],
svec!["101004113902", "Yes"],
svec!["101004113473", "No"],
svec!["101004113604", "No"],
svec!["101004114154", "Yes"],
svec!["101004114383", "No"],
svec!["101004114795", "Yes"],
svec!["101004118346", "Yes"],
svec!["101004115302", "No"],
svec!["101004115066", "No"],
];

assert_eq!(got, expected);
}

#[test]
fn sqlp_boston311_case_expression_streaming() {
let wrk = Workdir::new("sqlp_boston311_case_expression_streaming");
Expand Down Expand Up @@ -1268,6 +1334,47 @@ fn sqlp_boston311_case() {
assert_eq!(got, expected);
}

#[test]
fn sqlp_boston311_case_zstd() {
let wrk = Workdir::new("sqlp_boston311_case_zst");
let test_file = wrk.load_test_file("boston311-100.csv.zst");

let mut cmd = wrk.command("sqlp");
cmd.arg(&test_file).arg(
r#"SELECT case_enquiry_id,
CASE
WHEN case_title ~* 'graffiti' THEN 'Graffitti'
WHEN case_title ~* 'vehicle' THEN 'Vehicle'
WHEN case_title ~* 'sidewalk' THEN 'Sidewalk'
ELSE 'Something else'
END as topic
from read_csv('boston311-100.csv.zst')
where case_status = 'Open'"#,
);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["case_enquiry_id", "topic"],
svec!["101004143000", "Something else"],
svec!["101004155594", "Something else"],
svec!["101004154423", "Sidewalk"],
svec!["101004141848", "Something else"],
svec!["101004113313", "Something else"],
svec!["101004113751", "Graffitti"],
svec!["101004113902", "Graffitti"],
svec!["101004113473", "Sidewalk"],
svec!["101004113604", "Something else"],
svec!["101004114154", "Graffitti"],
svec!["101004114383", "Something else"],
svec!["101004114795", "Graffitti"],
svec!["101004118346", "Graffitti"],
svec!["101004115302", "Vehicle"],
svec!["101004115066", "Sidewalk"],
];

assert_eq!(got, expected);
}

#[test]
fn sqlp_literal_pattern_match() {
let wrk = Workdir::new("sqlp_literal_pattern_match");
Expand Down

0 comments on commit 0bf3734

Please sign in to comment.