From 06dc27c123daafdd2093406a0d6f64331e25b8c1 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 27 Jan 2024 12:05:42 -0500 Subject: [PATCH 1/2] `deps`: polars bumped from 0.36 to 0.37 - added jsonpath_lib needed by polars 0.37 - added addl polars features required to get addl sqlp capabilities - removed `horizontal_concat` feature which was removed from polars 0.37 - added notes about patched crates --- Cargo.lock | 177 ++++++++++++++++++++++++++++++++++++++++------------- Cargo.toml | 13 +++- 2 files changed, 143 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c019ea27..cc45861d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -388,16 +388,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-format" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" -dependencies = [ - "planus", - "serde", -] - [[package]] name = "arrow-ord" version = "49.0.0" @@ -938,6 +928,28 @@ dependencies = [ "windows-targets 0.52.0", ] +[[package]] +name = "chrono-tz" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d7b79e99bfaa0d47da0687c43aa3b7381938a62ad3a6498599039321f660b7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf 0.11.2", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf 0.11.2", + "phf_codegen", +] + [[package]] name = "cipher" version = "0.4.4" @@ -2147,6 +2159,12 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" +[[package]] +name = "hex" +version = "0.4.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hmac" version = "0.12.1" @@ -2380,6 +2398,12 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "itoap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" + [[package]] name = "jemalloc-sys" version = "0.5.4+5.3.0-patched" @@ -2446,6 +2470,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonpath_lib" +version = "0.3.0" +source = "git+https://github.com/ritchie46/jsonpath?branch=improve_compiled#24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" +dependencies = [ + "log", + "serde", + "serde_json", +] + [[package]] name = "jsonschema" version = "0.17.1" @@ -3190,6 +3224,15 @@ dependencies = [ "futures", ] +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + [[package]] name = "password-hash" version = "0.4.2" @@ -3280,6 +3323,16 @@ dependencies = [ "phf_shared 0.11.2", ] +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", +] + [[package]] name = "phf_generator" version = "0.10.0" @@ -3410,9 +3463,9 @@ checksum = "626dec3cac7cc0e1577a2ec3fc496277ec2baa084bebad95bb6fdbfae235f84c" [[package]] name = "polars" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"938048fcda6a8e2ace6eb168bee1b415a92423ce51e418b853bf08fc40349b6b" +checksum = "e43795c49010cb851d45227caa17769e83760e21d260ba6285c563b754e1652f" dependencies = [ "getrandom", "polars-core", @@ -3426,16 +3479,17 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce68a02f698ff7787c261aea1b4c040a8fe183a8fb200e2436d7f35d95a1b86f" +checksum = "faacd21a2548fa6d50c72d6b8d4649a8e029a0f3c6c5545b7f436f0610e49b0f" dependencies = [ "ahash 0.8.7", - "arrow-format", + "atoi", "atoi_simd", "avro-schema", "bytemuck", "chrono", + "chrono-tz", "dyn-clone", "either", "ethnum", @@ -3445,9 +3499,11 @@ dependencies = [ "getrandom", "hashbrown 0.14.3", "itoa", + "itoap", "lz4", "multiversion", "num-traits", + "polars-arrow-format", "polars-error", "polars-utils", "ryu", @@ -3458,29 +3514,41 @@ dependencies = [ "zstd 0.13.0", ] +[[package]] +name = "polars-arrow-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b0ef2474af9396b19025b189d96e992311e6a47f90c53cd998b36c4c64b84c" +dependencies = [ + "planus", + "serde", +] + [[package]] name = "polars-compute" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14fbc5f141b29b656a4cec4802632e5bff10bf801c6809c6bbfbd4078a044dd" +checksum = "32d9dc87f8003ae0edeef5ad9ac92b2a345480bbe17adad64496113ae84706dd" dependencies = [ "bytemuck", "num-traits", "polars-arrow", + "polars-error", "polars-utils", "version_check", ] [[package]] name = "polars-core" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f5efe734b6cbe5f97ea769be8360df5324fade396f1f3f5ad7fe9360ca4a23" +checksum = "befd4d280a82219a01035c4f901319ceba65998c594d0c64f9a439cdee1d7777" dependencies = [ "ahash 0.8.7", "bitflags 2.4.2", "bytemuck", "chrono", + "chrono-tz", "comfy-table", 
"either", "hashbrown 0.14.3", @@ -3505,12 +3573,12 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6396de788f99ebfc9968e7b6f523e23000506cde4ba6dfc62ae4ce949002a886" +checksum = "50f2435b02d1ba36d8c1f6a722cad04e4c0b2705a3112c5706e6960d405d7798" dependencies = [ - "arrow-format", "avro-schema", + "polars-arrow-format", "regex", "simdutf8", "thiserror", @@ -3518,9 +3586,9 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d0458efe8946f4718fd352f230c0db5a37926bd0d2bd25af79dc24746abaaea" +checksum = "b51fba2cf014cb39c2b38353d601540fb9db643be65abb9ca8ff44b9c4c4a88e" dependencies = [ "ahash 0.8.7", "async-trait", @@ -3556,9 +3624,9 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea47d46b7a98fa683ef235ad48b783abf61734828e754096cfbdc77404fff9b3" +checksum = "973d1f40ba964e70cf0038779056a7850f649538f72d8828c21bc1a7bce312ed" dependencies = [ "ahash 0.8.7", "chrono", @@ -3577,9 +3645,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d7105b40905bb38e8fc4a7fd736594b7491baa12fad3ac492969ca221a1b5d5" +checksum = "d83343e413346f048f3a5ad07c0ea4b5d0bada701a482878213142970b0ddff8" dependencies = [ "ahash 0.8.7", "bitflags 2.4.2", @@ -3601,34 +3669,42 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e09afc456ab11e75e5dcb43e00a01c71f3a46a2781e450054acb6bb096ca78e" +checksum = "6395f5fd5e1adf016fd6403c0a493181c1a349a7a145b2687cdf50a0d630310a" dependencies = [ "ahash 0.8.7", "argminmax", 
+ "base64", "bytemuck", + "chrono", + "chrono-tz", "either", "hashbrown 0.14.3", + "hex", "indexmap", + "jsonpath_lib", "memchr", "num-traits", "polars-arrow", "polars-compute", "polars-core", "polars-error", + "polars-json", "polars-utils", "rayon", "regex", + "serde_json", "smartstring", + "unicode-reverse", "version_check", ] [[package]] name = "polars-parquet" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba24d67b1f64ab85143033dd46fa090b13c0f74acdf91b0780c16aecf005e3d" +checksum = "b664cac41636cc9f146fba584a8e7c2790d7335a278964529fa3e9b4eae96daf" dependencies = [ "ahash 0.8.7", "async-stream", @@ -3652,9 +3728,9 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b7ead073cc3917027d77b59861a9f071db47125de9314f8907db1a0a3e4100" +checksum = "390a831b864bc57a4cb260b0595030dfb6a4260a3723cf8ca17968ee2078b8ff" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -3676,12 +3752,13 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384a175624d050c31c473ee11df9d7af5d729ae626375e522158cfb3d150acd0" +checksum = "7fb7d7527be2aa33baace9000f6772eb9df7cd57ec010a4b273435d2dc1349e8" dependencies = [ "ahash 0.8.7", "bytemuck", + "chrono-tz", "once_cell", "percent-encoding", "polars-arrow", @@ -3701,9 +3778,9 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32322f7acbb83db3e9c7697dc821be73d06238da89c817dcc8bc1549a5e9c72f" +checksum = "f4984d97aad3d0db92afe76ebcab10b5e37a1216618b5703ae0d2917ccd6168c" dependencies = [ "polars-arrow", "polars-error", @@ -3712,10 +3789,11 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.36.2" +version 
= "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f0b4c6ddffdfd0453e84bc3918572c633014d661d166654399cf93752aa95b5" +checksum = "77f62a8b8f93146ec1eb2ef340d77eeb174e8010035e449bfdd424d2b1fd944a" dependencies = [ + "hex", "polars-arrow", "polars-core", "polars-error", @@ -3729,12 +3807,13 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee2649fc96bd1b6584e0e4a4b3ca7d22ed3d117a990e63ad438ecb26f7544d0" +checksum = "6d75348a51d0c97f3b83df860ecb35a6ac6c5dafc6278cac4e1ac101d96dc753" dependencies = [ "atoi", "chrono", + "chrono-tz", "now", "once_cell", "polars-arrow", @@ -3748,9 +3827,9 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b174ca4a77ad47d7b91a0460aaae65bbf874c8bfbaaa5308675dadef3976bbda" +checksum = "38f9c955bb1e9b55d835aeb7fe4e4e8826e01abe5f0ada979ceb7d2b9af7b569" dependencies = [ "ahash 0.8.7", "bytemuck", @@ -4037,6 +4116,7 @@ dependencies = [ "itoa", "jemallocator", "jql-runner", + "jsonpath_lib", "jsonschema", "jsonxf", "log", @@ -5620,6 +5700,15 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-reverse" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bea5dacebb0d2d0a69a6700a05b59b3908bf801bf563a49bd27a1b60122962c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "unicode-segmentation" version = "1.10.1" diff --git a/Cargo.toml b/Cargo.toml index 4b1a5fd23..9cf6963eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,6 +120,7 @@ indicatif = "0.17" itertools = "0.12" itoa = "1" jemallocator = { version = "0.5", optional = true } +jsonpath_lib = { version = "0.3", optional = true } jsonschema = { version = "0.17", features = [ "resolve-file", "resolve-http", @@ -136,7 +137,7 @@ mlua = { version = 
"0.9", features = [ num_cpus = "1" odht = "0.3" phf = { version = "0.11", features = ["macros"], optional = true } -polars = { version = "0.36", features = [ +polars = { version = "0.37", features = [ "lazy", "streaming", "object", @@ -144,12 +145,13 @@ polars = { version = "0.36", features = [ "cross_join", "semi_anti_join", "sql", + "extract_jsonpath", + "binary_encoding", "json", "parquet", "ipc", "performant", "cse", - "horizontal_concat", "avro", ], optional = true } pyo3 = { version = "0.20", features = ["auto-initialize"], optional = true } @@ -233,10 +235,15 @@ rusqlite = { version = "0.29", features = ["bundled"] } serial_test = { version = "3.0", features = ["file_locks"] } [patch.crates-io] +# needed as dynfmt doesn't work in release mode without this dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } +# needed to get latest dependencies. Even though dependabot PRs are merged, but there are no new releases grex = { git = "https://github.com/pemistahl/grex", rev = "8f6b35cee5f911311c2e0ef6e56f333e4c896112" } +# needed to get rid of old hashbrown dependency halfbrown = { git = "https://github.com/licenser/halfbrown", rev = "7cecc29422ae2775abe35a2e430f1678b4f1aa76" } gender_guesser = { git = "https://github.com/jqnatividad/gender_guesser", branch = "bundle_namdict_txt"} +# needed for polars 0.37 +jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", branch = "improve_compiled" } [features] default = ["mimalloc"] @@ -294,7 +301,7 @@ to = ["csvs_convert"] to_parquet = ["csvs_convert/parquet"] lite = [] datapusher_plus = ["self_update"] -polars = ["dep:polars", "smartstring"] +polars = ["dep:polars", "smartstring", "jsonpath_lib"] feature_capable = [] nightly = [ "regex/unstable", From d3b4b5d3aac8d71d36fdb999aa4851749958f8dd Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 27 Jan 2024 12:06:29 -0500 Subject: [PATCH 2/2] `tests`: addl sqlp test 
for stuff fixed and introduced in polars 0.37 --- tests/test_sqlp.rs | 207 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 200 insertions(+), 7 deletions(-) diff --git a/tests/test_sqlp.rs b/tests/test_sqlp.rs index 8abb0bb5c..46f343a7f 100644 --- a/tests/test_sqlp.rs +++ b/tests/test_sqlp.rs @@ -491,7 +491,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["true"], svec!["true"], - svec!["true"], + svec!["false"], svec!["true"], svec!["false"], ]; @@ -523,7 +523,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["abc"], svec!["abc"], - svec!["abc"], + svec![" "], svec!["a"], svec!["b"], ]; @@ -539,7 +539,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["bc"], svec!["bc"], - svec!["bc"], + svec![" "], svec![""], svec![""], ]; @@ -555,7 +555,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["ABCDE"], svec!["ABC"], - svec!["ABC"], + svec![" ABC"], svec!["A"], svec!["B"], ]; @@ -571,7 +571,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["abcde"], svec!["abc"], - svec!["abc"], + svec![" abc"], svec!["a"], svec!["b"], ]; @@ -587,7 +587,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["5"], svec!["3"], - svec!["3"], + svec!["7"], svec!["1"], svec!["1"], ]; @@ -603,7 +603,7 @@ fn sqlp_string_functions() { svec!["scol"], svec!["5"], svec!["3"], - svec!["3"], + svec!["7"], svec!["1"], svec!["1"], ]; @@ -1180,3 +1180,196 @@ fn sqlp_sql_tsv() { assert_eq!(got, expected); } + +#[test] +fn sqlp_binary_functions() { + let wrk = Workdir::new("sqlp_sql_binary_functions"); + wrk.create("dummy.csv", vec![svec!["dummy"], svec!["0"]]); + + let mut cmd = wrk.command("sqlp"); + cmd.arg("dummy.csv") + .arg( + r#" + SELECT *, + -- bit strings + b'' AS b0, + b'1001' AS b1, + b'11101011' AS b2, + b'1111110100110010' AS b3, + -- hex strings + x'' AS x0, + x'FF' AS x1, + x'4142' AS x2, + x'DeadBeef' AS x3, + FROM dummy + "#, + ) + .args(["--format", "parquet"]); + + wrk.assert_success(&mut cmd); +} + +#[test] +fn sqlp_length_fns() { + let wrk = 
Workdir::new("sqlp_sql_length_fns"); + wrk.create( + "test.csv", + vec![svec!["words"], svec!["Cafe"], svec![""], svec!["東京"]], + ); + + let mut cmd = wrk.command("sqlp"); + cmd.arg("test.csv").arg( + r#" + SELECT + words, + LENGTH(words) AS n_chrs1, + CHAR_LENGTH(words) AS n_chrs2, + CHARACTER_LENGTH(words) AS n_chrs3, + OCTET_LENGTH(words) AS n_bytes, + BIT_LENGTH(words) AS n_bits + FROM test +"#, + ); + + wrk.assert_success(&mut cmd); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["words", "n_chrs1", "n_chrs2", "n_chrs3", "n_bytes", "n_bits"], + svec!["Cafe", "4", "4", "4", "4", "32"], + svec!["", "", "", "", "", ""], + svec!["東京", "2", "2", "2", "6", "48"], + ]; + + assert_eq!(got, expected); +} + +#[test] +fn sqlp_nullif_coalesce() { + let wrk = Workdir::new("sqlp_nullif_coalesce"); + wrk.create( + "test.csv", + vec![ + svec!["x", "y", "z"], + svec!["1", "5", "3"], + svec!["", "4", "4"], + svec!["2", "", ""], + svec!["3", "3", "3"], + svec!["", "", "6"], + svec!["4", "2", ""], + ], + ); + + let mut cmd = wrk.command("sqlp"); + cmd.arg("test.csv").arg( + r#" + SELECT + COALESCE(x,y,z) as "coalsc", + NULLIF(x, y) as "nullif x_y", + NULLIF(y, z) as "nullif y_z", + IFNULL(x, y) as "ifnull x_y", + IFNULL(y,-1) as "inullf y_z", + COALESCE(x, NULLIF(y,z)) as "both" + FROM test +"#, + ); + + wrk.assert_success(&mut cmd); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec![ + "coalsc", + "nullif x_y", + "nullif y_z", + "ifnull x_y", + "inullf y_z", + "both" + ], + svec!["1", "1", "5", "1", "5", "1"], + svec!["4", "", "", "4", "4", ""], + svec!["2", "2", "", "2", "-1", "2"], + svec!["3", "", "", "3", "3", "3"], + svec!["6", "", "", "", "-1", ""], + svec!["4", "4", "2", "4", "2", "4"], + ]; + + assert_eq!(got, expected); +} + +#[test] +fn sqlp_div_sign() { + let wrk = Workdir::new("sqlp_div_sign"); + wrk.create( + "test.csv", + vec![ + svec!["a", "b"], + svec!["10.0", "-100.5"], + svec!["20.0", "7.0"], +
svec!["30.0", "2.5"], + svec!["40.0", ""], + svec!["50.0", "-3.14"], + ], + ); + + let mut cmd = wrk.command("sqlp"); + cmd.arg("test.csv").arg( + r#" + SELECT + a / b AS a_div_b, + a // b AS a_floordiv_b, + SIGN(b) AS b_sign, + FROM test +"#, + ); + + wrk.assert_success(&mut cmd); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["a_div_b", "a_floordiv_b", "b_sign"], + svec!["-0.09950248756218906", "-1", "-1"], + svec!["2.857142857142857", "2", "1"], + svec!["12.0", "12", "1"], + svec!["", "", ""], + svec!["-15.92356687898089", "-16", "-1"], + ]; + + assert_eq!(got, expected); +} + +#[test] +fn sqlp_string_replace() { + let wrk = Workdir::new("sqlp_string_replace"); + wrk.create( + "test.csv", + vec![ + svec!["words"], + svec!["Yemeni coffee is the best coffee"], + svec![""], + ], + ); + + let mut cmd = wrk.command("sqlp"); + cmd.arg("test.csv").arg( + r#" + SELECT + REPLACE( + REPLACE(words, 'coffee', 'tea'), + 'Yemeni', + 'English breakfast' + ) + FROM test +"#, + ); + + wrk.assert_success(&mut cmd); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["words"], + svec!["English breakfast tea is the best tea"], + ]; + + assert_eq!(got, expected); +}