diff --git a/Cargo.lock b/Cargo.lock index cf9a20002..e524afeed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1317,6 +1317,15 @@ dependencies = [ "error-code", ] +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "codepage" version = "0.1.2" @@ -2214,9 +2223,9 @@ dependencies = [ [[package]] name = "event-listener-strategy" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" dependencies = [ "event-listener", "pin-project-lite", @@ -2338,6 +2347,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", + "libz-ng-sys", "miniz_oxide", ] @@ -3571,6 +3581,16 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libz-ng-sys" +version = "1.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4436751a01da56f1277f323c80d584ffad94a3d14aecd959dd0dff75aa73a438" +dependencies = [ + "cmake", + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -4707,6 +4727,7 @@ dependencies = [ "chrono", "chrono-tz 0.8.6", "fast-float2", + "flate2", "fs4", "futures", "glob", @@ -4738,6 +4759,7 @@ dependencies = [ "tokio", "tokio-util", "url", + "zstd", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b5d96b441..a6cbe7d44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -173,6 +173,7 @@ polars = { version = "0.44.2", features = [ # "cloud", "coalesce", "cross_join", + "decompress-fast", # "dtype-full", "extract_jsonpath", "ipc", diff --git a/resources/test/boston311-100.csv.gz b/resources/test/boston311-100.csv.gz new file mode 
100644 index 000000000..3e0d29469 Binary files /dev/null and b/resources/test/boston311-100.csv.gz differ diff --git a/resources/test/boston311-100.csv.zlib b/resources/test/boston311-100.csv.zlib new file mode 100644 index 000000000..6ba3e04ae Binary files /dev/null and b/resources/test/boston311-100.csv.zlib differ diff --git a/resources/test/boston311-100.csv.zst b/resources/test/boston311-100.csv.zst new file mode 100644 index 000000000..55181c65f Binary files /dev/null and b/resources/test/boston311-100.csv.zst differ diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index 3e4a5abbf..040f04379 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -123,6 +123,12 @@ Example queries: # configurability (i.e. limited to comma delimiter, no CSV comments, etc.). qsv sqlp SKIP_INPUT "select * from read_csv('data.csv') order by col1 desc limit 100" + # note that you can also use read_csv() to read compressed files directly + # gzip, zstd and zlib automatic decompression are supported + qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.gz')" + qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.zst')" + qsv sqlp SKIP_INPUT "select * from read_csv('data.csv.zlib')" + Note that sqlp will automatically use this "fast path" read_csv() optimization when there is only one input CSV file, no CSV parsing options are used, its not a SQL script and the `--no-optimizations` flag is not set. 
diff --git a/tests/test_sqlp.rs b/tests/test_sqlp.rs
index fd2022950..495005e7a 100644
--- a/tests/test_sqlp.rs
+++ b/tests/test_sqlp.rs
@@ -1145,6 +1145,33 @@ fn sqlp_boston311_cte() {
     assert_eq!(got, expected);
 }
 
+// read_csv() on a gzip-compressed fixture, used inside a CTE.
+#[test]
+fn sqlp_boston311_cte_gz() {
+    let wrk = Workdir::new("sqlp_boston311_cte_gz");
+    let test_file = wrk.load_test_file("boston311-100.csv.gz");
+
+    let mut cmd = wrk.command("sqlp");
+    cmd.arg(&test_file).arg(
+        r#"with boston311_roxbury as (select * from read_csv('boston311-100.csv.gz') where neighborhood = 'Roxbury')
+        select ward,count(*) as cnt from boston311_roxbury group by ward order by cnt desc, ward asc;"#,
+    );
+
+    wrk.assert_success(&mut cmd);
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["ward", "cnt"],
+        svec!["Ward 11", "2"],
+        svec!["Ward 13", "2"],
+        svec!["Ward 8", "2"],
+        svec!["14", "1"],
+        svec!["Ward 12", "1"],
+    ];
+
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn sqlp_boston311_case_expression() {
     let wrk = Workdir::new("sqlp_boston311_case_expression");
@@ -1185,6 +1212,49 @@ fn sqlp_boston311_case_expression() {
     assert_eq!(got, expected);
 }
 
+// read_csv() on a zlib-compressed fixture, combined with a CASE <expr> WHEN expression.
+#[test]
+fn sqlp_boston311_case_expression_zlib() {
+    let wrk = Workdir::new("sqlp_boston311_case_expression_zlib");
+    let test_file = wrk.load_test_file("boston311-100.csv.zlib");
+
+    let mut cmd = wrk.command("sqlp");
+    cmd.arg(&test_file).arg(
+        r#"SELECT case_enquiry_id,
+           CASE closed_dt is null and case_title ~* 'graffiti'
+                WHEN True THEN 'Yes'
+                WHEN False THEN 'No'
+                ELSE 'N/A'
+           END as graffiti_related
+           from read_csv('boston311-100.csv.zlib')
+           where case_status = 'Open'"#,
+    );
+
+    wrk.assert_success(&mut cmd);
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["case_enquiry_id", "graffiti_related"],
+        svec!["101004143000", "No"],
+        svec!["101004155594", "No"],
+        svec!["101004154423", "No"],
+        svec!["101004141848", "No"],
+        svec!["101004113313", "No"],
+        svec!["101004113751", "Yes"],
+        svec!["101004113902", "Yes"],
+        svec!["101004113473", "No"],
+        svec!["101004113604", "No"],
+        svec!["101004114154", "Yes"],
+        svec!["101004114383", "No"],
+        svec!["101004114795", "Yes"],
+        svec!["101004118346", "Yes"],
+        svec!["101004115302", "No"],
+        svec!["101004115066", "No"],
+    ];
+
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn sqlp_boston311_case_expression_streaming() {
     let wrk = Workdir::new("sqlp_boston311_case_expression_streaming");
@@ -1268,6 +1338,50 @@ fn sqlp_boston311_case() {
     assert_eq!(got, expected);
 }
 
+// read_csv() on a zstd-compressed fixture, combined with a searched CASE WHEN expression.
+#[test]
+fn sqlp_boston311_case_zstd() {
+    let wrk = Workdir::new("sqlp_boston311_case_zst");
+    let test_file = wrk.load_test_file("boston311-100.csv.zst");
+
+    let mut cmd = wrk.command("sqlp");
+    cmd.arg(&test_file).arg(
+        r#"SELECT case_enquiry_id,
+           CASE
+                WHEN case_title ~* 'graffiti' THEN 'Graffitti'
+                WHEN case_title ~* 'vehicle' THEN 'Vehicle'
+                WHEN case_title ~* 'sidewalk' THEN 'Sidewalk'
+                ELSE 'Something else'
+           END as topic
+           from read_csv('boston311-100.csv.zst')
+           where case_status = 'Open'"#,
+    );
+
+    wrk.assert_success(&mut cmd);
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["case_enquiry_id", "topic"],
+        svec!["101004143000", "Something else"],
+        svec!["101004155594", "Something else"],
+        svec!["101004154423", "Sidewalk"],
+        svec!["101004141848", "Something else"],
+        svec!["101004113313", "Something else"],
+        svec!["101004113751", "Graffitti"],
+        svec!["101004113902", "Graffitti"],
+        svec!["101004113473", "Sidewalk"],
+        svec!["101004113604", "Something else"],
+        svec!["101004114154", "Graffitti"],
+        svec!["101004114383", "Something else"],
+        svec!["101004114795", "Graffitti"],
+        svec!["101004118346", "Graffitti"],
+        svec!["101004115302", "Vehicle"],
+        svec!["101004115066", "Sidewalk"],
+    ];
+
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn sqlp_literal_pattern_match() {
     let wrk = Workdir::new("sqlp_literal_pattern_match");