From c271e6f7362b0b1497db5c029a2bcf587b5b8b6f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 08:07:18 -0500 Subject: [PATCH 1/6] feat: add geometric mean and harmonic mean - also short-circuit modes calc if cardinality==rowcount (no modes) --- src/cmd/stats.rs | 74 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 1cbd6a83a..3b3f8d725 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -448,6 +448,8 @@ const STATSDATA_TYPES_ARRAY: [JsonTypes; MAX_STAT_COLUMNS] = [ JsonTypes::Float, //avg_length JsonTypes::Float, //mean JsonTypes::Float, //sem + JsonTypes::Float, //geometric_mean + JsonTypes::Float, //harmonic_mean JsonTypes::Float, //stddev JsonTypes::Float, //variance JsonTypes::Float, //cv @@ -489,7 +491,7 @@ const MS_IN_DAY_INT: i64 = 86_400_000; const DAY_DECIMAL_PLACES: u32 = 5; // maximum number of output columns -const MAX_STAT_COLUMNS: usize = 37; +const MAX_STAT_COLUMNS: usize = 39; // maximum number of antimodes to display const MAX_ANTIMODES: usize = 10; @@ -1187,6 +1189,8 @@ impl Args { "avg_length", "mean", "sem", + "geometric_mean", + "harmonic_mean", "stddev", "variance", "cv", @@ -1539,6 +1543,8 @@ impl Stats { minmax_range_sortorder_pieces.extend_from_slice(&[empty(), empty(), empty(), empty()]); } + let record_count = *RECORD_COUNT.get().unwrap_or(&1); + // modes/antimodes & cardinality // we do this second because we can use the sort order with cardinality, to skip sorting // if its not required. This makes not only cardinality computation faster, it also makes @@ -1562,19 +1568,24 @@ impl Stats { } if self.which.mode { // mode/s - let (modes_result, modes_count, mode_occurrences) = v.modes(); - let modes_list = modes_result - .iter() - .map(|c| String::from_utf8_lossy(c)) - .join(","); - mc_pieces.extend_from_slice(&[ - modes_list, - modes_count.to_string(), - mode_occurrences.to_string(), - ]); + if cardinality == record_count { + // all values unique, short-circuit modes calculation as there is none + mc_pieces.extend_from_slice(&[empty(), "0".to_string(), "0".to_string()]); + } else { + let (modes_result, modes_count, mode_occurrences) = v.modes(); + let modes_list = modes_result + .iter() + .map(|c| String::from_utf8_lossy(c)) + .join(","); + mc_pieces.extend_from_slice(&[ + modes_list, + modes_count.to_string(), + mode_occurrences.to_string(), + ]); + } // antimode/s - if mode_occurrences == 0 { + if cardinality == record_count { // all the values are unique // so instead of returning everything, just say *ALL mc_pieces.extend_from_slice(&[ @@ -1680,10 +1691,7 @@ impl Stats { // so we can compute avg_length pieces.push(itoa::Buffer::new().format(stotlen).to_owned()); #[allow(clippy::cast_precision_loss)] - pieces.push(util::round_num( - stotlen as f64 / *RECORD_COUNT.get().unwrap_or(&1) as f64, - 4, - )); + pieces.push(util::round_num(stotlen as f64 / record_count as f64, 4)); } else { // however, we saturated the sum, it means we had an overflow // so we return OVERFLOW_STRING for sum and avg length @@ -1699,19 +1707,31 @@ impl Stats { pieces.extend_from_slice(&[empty(), empty(), empty(), empty()]); } - // mean, sem, stddev, variance & cv + // mean, sem, geometric_mean, harmonic_mean, stddev, variance & cv if typ == TString || typ == TNull { - pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[ + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + ]); } else if let Some(ref v) = self.online { let std_dev = v.stddev(); #[allow(clippy::cast_precision_loss)] let sem = std_dev / (v.len() as f64).sqrt(); let mean = v.mean(); let cv = (std_dev / mean) * 100_f64; + let geometric_mean = v.geometric_mean(); + let harmonic_mean = v.harmonic_mean(); if self.typ == TFloat || self.typ == TInteger { pieces.extend_from_slice(&[ util::round_num(mean, round_places), util::round_num(sem, round_places), + util::round_num(geometric_mean, round_places), + util::round_num(harmonic_mean, round_places), util::round_num(std_dev, round_places), util::round_num(v.variance(), round_places), util::round_num(cv, round_places), @@ -1726,6 +1746,14 @@ impl Stats { sem / MS_IN_DAY, u32::max(round_places, DAY_DECIMAL_PLACES), )); + pieces.push(util::round_num( + geometric_mean / MS_IN_DAY, + u32::max(round_places, DAY_DECIMAL_PLACES), + )); + pieces.push(util::round_num( + harmonic_mean / MS_IN_DAY, + u32::max(round_places, DAY_DECIMAL_PLACES), + )); pieces.push(util::round_num( std_dev / MS_IN_DAY, u32::max(round_places, DAY_DECIMAL_PLACES), @@ -1737,7 +1765,15 @@ impl Stats { pieces.push(util::round_num(cv, round_places)); } } else { - pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[ + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + ]); } // nullcount From 53e3b2f568fb6657b592e16facad50a2defb62cd Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 08:08:09 -0500 Subject: [PATCH 2/6] deps: bump qsv-stats to 0.24 --- Cargo.lock | 24 ++++++++++++------------ Cargo.toml | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6fa1515bd..704e41a3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4646,7 +4646,7 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror 2.0.4", + "thiserror 2.0.5", "version_check", "xxhash-rust", ] @@ -4661,7 +4661,7 @@ dependencies = [ "polars-arrow-format", "regex", "simdutf8", - "thiserror 2.0.4", + "thiserror 2.0.5", ] [[package]] @@ -5422,9 +5422,9 @@ dependencies = [ [[package]] name = "qsv-stats" -version = "0.22.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d3d4715843b98e31967047e4a8d45ea5d733c3c90ad7bde81a62aafa9743d7b" +checksum = "9b50d07636d57c4de44fa144eb44e08dc27a8cbea5ec2a5f47e07196fb548dd4" dependencies = [ "ahash", "num-traits", @@ -5525,7 +5525,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror 2.0.4", + "thiserror 2.0.5", "tokio", "tracing", ] @@ -5544,7 +5544,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.4", + "thiserror 2.0.5", "tinyvec", "tracing", "web-time", @@ -6792,11 +6792,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490" +checksum = "643caef17e3128658ff44d85923ef2d28af81bb71e0d67bbfe1d76f19a73e053" dependencies = [ - "thiserror-impl 2.0.4", + "thiserror-impl 2.0.5", ] [[package]] @@ -6812,9 +6812,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" +checksum = "995d0bbc9995d1f19d28b7215a9352b0fc3cd3a2d2ec95c2cadc485cdedbcdde" dependencies = [ "proc-macro2", "quote", @@ -8273,7 +8273,7 @@ dependencies = [ "pbkdf2", "rand", "sha1", - "thiserror 2.0.4", + "thiserror 2.0.5", "time", "zeroize", "zopfli", diff --git a/Cargo.toml b/Cargo.toml index cab5af43b..5107aee8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -192,7 +192,7 @@ publicsuffix = { version = "2.2", optional = true } pyo3 = { version = "0.21.2", features = ["auto-initialize"], optional = true } qsv-dateparser = "0.12" qsv_docopt = "1.8" -qsv-stats = "0.22" +qsv-stats = "0.24" qsv_currency = "0.7" qsv-sniffer = { version = "0.10", default-features = false, features = [ "runtime-dispatch-simd", From 5809ddb5b7a90958e91258acaf2d06c18066f4a6 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:01:29 -0500 Subject: [PATCH 3/6] tests: update `stats` test files to account for new hashes and new means columns --- .../test/boston311-10-boolean-1or0-stats.csv | 70 +++++++++---------- .../test/boston311-10-boolean-tf-stats.csv | 70 +++++++++---------- ...boston311-100-everything-8places-stats.csv | 2 +- ...-everything-date-stats-variance-stddev.csv | 68 +++++++++--------- .../boston311-100-everything-date-stats.csv | 2 +- ...ton311-100-everything-datenotime-stats.csv | 2 +- ...hing-inferdates-defaultwhitelist-stats.csv | 2 +- .../boston311-100-everything-nodate-stats.csv | 2 +- resources/test/boston311-100-stats.csv | 2 +- .../boston311-100-with-nonascii-stats.csv | 2 +- 10 files changed, 111 insertions(+), 111 deletions(-) diff --git a/resources/test/boston311-10-boolean-1or0-stats.csv b/resources/test/boston311-10-boolean-1or0-stats.csv index 30ae28b7c..d52d28d4b 100644 --- a/resources/test/boston311-10-boolean-1or0-stats.csv +++ b/resources/test/boston311-10-boolean-1or0-stats.csv @@ -1,35 +1,35 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value -case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,14747.2697,217481962.3498,0,0,,0,10, -open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,0,,0,10, -target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,4,,0.4,6, -closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,5,,0.5,6, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,0,,0,2, -case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,0,,0,2, -case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,0.5,0.1581,0.5,0.25,100,0,,0,2, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,0,,0,6, -case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,0,,0,8, -subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,0,,0,5, -reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,0,,0,7, -type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,0,,0,8, -queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,0,,0,7, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,0,,0,5, -submittedphoto,NULL,,,,,,,0,0,,,,,,,,10,,1,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,10,,1,1, -location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,0,,0,10, -fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,0,,0,4, -pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,0,,0,6, -city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,0,,0,6, -police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,0,,0,6, -neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,0,,0,8, -neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,0,,0,7, -ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,0,,0,8, -precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,0,,0,9, -location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,1,,0.1,10, -location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,1,,0.1,8, -latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,0.0252,0.0006,0.0595,0,4,0,9, -longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,0.0246,0.0006,-0.0346,0,4,0,10, -source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,0,,0,2, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,10 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,30 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,3887 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,d9cd6cf751bdfdaaee1cca903f0e7f4182bc39a2d91a59a8438616642c47b590 +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value +case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, +open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10, +target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6, +closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2, +case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2, +case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,0.5,0.1581,0,,0.5,0.25,100,0,,0,2, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6, +case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8, +subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5, +reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7, +type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8, +queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5, +submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, +location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10, +fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4, +pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6, +city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6, +police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6, +neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8, +neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7, +ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8, +precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9, +location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10, +location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8, +latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, +longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, +source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3887 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,1a4c2204a401f6791b6e5efde990955e1b6c59aec5b3de300686fadb63ee457b diff --git a/resources/test/boston311-10-boolean-tf-stats.csv b/resources/test/boston311-10-boolean-tf-stats.csv index 0666ddfd1..f0b085e95 100644 --- a/resources/test/boston311-10-boolean-tf-stats.csv +++ b/resources/test/boston311-10-boolean-tf-stats.csv @@ -1,35 +1,35 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value -case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,14747.2697,217481962.3498,0,0,,0,10, -open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,0,,0,10, -target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,4,,0.4,6, -closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,5,,0.5,6, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,0,,0,2, -case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,0,,0,2, -case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,,,,,,0,,0,2, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,0,,0,6, -case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,0,,0,8, -subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,0,,0,5, -reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,0,,0,7, -type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,0,,0,8, -queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,0,,0,7, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,0,,0,5, -submittedphoto,NULL,,,,,,,0,0,,,,,,,,10,,1,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,10,,1,1, -location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,0,,0,10, -fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,0,,0,4, -pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,0,,0,6, -city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,0,,0,6, -police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,0,,0,6, -neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,0,,0,8, -neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,0,,0,7, -ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,0,,0,8, -precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,0,,0,9, -location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,1,,0.1,10, -location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,1,,0.1,8, -latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,0.0252,0.0006,0.0595,0,4,0,9, -longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,0.0246,0.0006,-0.0346,0,4,0,10, -source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,0,,0,2, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,10 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,30 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,3922 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,782abcbf6f159759b8ca1af34c49ddefff6bc01528c4994a05d0ec6314f49852 +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value +case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, +open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10, +target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6, +closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2, +case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2, +case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,,,,,,,,0,,0,2, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6, +case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8, +subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5, +reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7, +type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8, +queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5, +submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, +location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10, +fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4, +pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6, +city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6, +police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6, +neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8, +neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7, +ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8, +precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9, +location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10, +location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8, +latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, +longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, +source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3922 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,dd97ad46b4b34efa66aa634d6c54188eebaf44ef5aaa5dde38180c3435a9ddaa diff --git a/resources/test/boston311-100-everything-8places-stats.csv b/resources/test/boston311-100-everything-8places-stats.csv index f53a29e2d..b886a6471 100644 --- a/resources/test/boston311-100-everything-8places-stats.csv +++ b/resources/test/boston311-100-everything-8places-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3390cadbb4574cc52e0542eddba3a24e770788dd9826e706526fbda65ce4a0cf +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,bc8660240b24f489683c31066951cf6ccd248c0d688589a42963395372e03d43 diff --git a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv index 6d44c4fd2..969a054af 100644 --- a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv +++ b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv @@ -1,34 +1,34 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value -case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,790.552,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, -open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,2022-01-04T07:07:45.050+00:00,0.5568,5.568,31.00259,0.0293,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, -target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,2022-01-17T03:14:16.404+00:00,2.86258,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,1.71655,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, -case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, -fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, -pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, -city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, -police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0.0031,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0.0031,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, -source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a7c42f56ed489cec893c62fc171ea3b2ab6280651c7779d398e0de51fef92958 +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value +case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, +open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,2022-01-04T07:07:45.050+00:00,0.5568,18996.29623,18996.29542,5.568,31.00259,0.0293,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, +target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,2022-01-17T03:14:16.404+00:00,2.86258,19009.11578,19009.0967,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, +closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,1.71655,19000.04255,19000.036,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, +case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, +pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, +city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, +police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d diff --git a/resources/test/boston311-100-everything-date-stats.csv b/resources/test/boston311-100-everything-date-stats.csv index 64a5785a2..85e35cd78 100644 --- a/resources/test/boston311-100-everything-date-stats.csv +++ b/resources/test/boston311-100-everything-date-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a7c42f56ed489cec893c62fc171ea3b2ab6280651c7779d398e0de51fef92958 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d diff --git a/resources/test/boston311-100-everything-datenotime-stats.csv b/resources/test/boston311-100-everything-datenotime-stats.csv index 2e9b0798a..cb4411fe8 100644 --- a/resources/test/boston311-100-everything-datenotime-stats.csv +++ b/resources/test/boston311-100-everything-datenotime-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,45236 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,b479d4726f9ad6e4055800859486e87d7d61a5f86ea4ee643c951c7829157a47 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4142a9338de2e31210d6673e0c0a3d2533895823fecc36700c1ee540ee637b46 diff --git a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv index 7bf310c05..8a9ebf39f 100644 --- a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv +++ b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,caec23d5db01731cc207ea42005d5c094601ae30f9677e37d0d05b3c5d87f7aa +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e7537bd732c9a6eb2d1c34855ef7b803629d03f6316a125e89f377f74e9e7fd7 diff --git a/resources/test/boston311-100-everything-nodate-stats.csv b/resources/test/boston311-100-everything-nodate-stats.csv index 53e05dd4b..4678776d5 100644 --- a/resources/test/boston311-100-everything-nodate-stats.csv +++ b/resources/test/boston311-100-everything-nodate-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5638c434c951042551ae14e656fd36aefe85a8628a899cdd3d332a8933812986 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1551f88101b999f0ca88c62c062668a881513d9b2ee4af8741855501ebcdba0e diff --git a/resources/test/boston311-100-stats.csv b/resources/test/boston311-100-stats.csv index d7d9d1eb8..c9b80a70c 100644 --- a/resources/test/boston311-100-stats.csv +++ b/resources/test/boston311-100-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,a7c42f56ed489cec893c62fc171ea3b2ab6280651c7779d398e0de51fef92958 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d diff --git a/resources/test/boston311-100-with-nonascii-stats.csv b/resources/test/boston311-100-with-nonascii-stats.csv index e855fa73f..28aa0469f 100644 --- a/resources/test/boston311-100-with-nonascii-stats.csv +++ b/resources/test/boston311-100-with-nonascii-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,47704 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,cd3ea247f78bbd80b0ca282493939239efa2fb39051e061bead7d7638c917f54 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,f476d634e6251c59b71ed07431e30fb654b06da7095629657fa6fed28ea7adcf From a3ad0bc90444d536aaf880ade906ea7fe844ff92 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:02:24 -0500 Subject: [PATCH 4/6] refactor: `stats` fingerprint hash should be the first 22 columns which are always the same --- src/cmd/stats.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 3b3f8d725..3bf51fbce 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -832,12 +832,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // Compute hash of stats for data fingerprinting let stats_hash = { - let mut hash_input = Vec::with_capacity(16); + let mut hash_input = Vec::with_capacity(22); // First, create a stable representation of the stats for record in &stats_br_vec { - // Take first 16 columns only - for field in record.iter().take(16) { + // Take first 22 columns only + for field in record.iter().take(22) { let s = String::from_utf8_lossy(field); // Standardize number format if let Ok(f) = s.parse::() { From 63ffb0bb5b47975c57d163afc83c6e50d6a91d9d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:03:24 -0500 Subject: [PATCH 5/6] tests: update `stats` and `index` tests to account for new mean columns --- tests/test_index.rs | 22 ++++++++-- tests/test_stats.rs | 104 ++++++++++++++++++++++++++++---------------- 2 files changed, 84 insertions(+), 42 deletions(-) diff --git a/tests/test_index.rs b/tests/test_index.rs index 782366574..416a69473 100644 --- a/tests/test_index.rs +++ b/tests/test_index.rs @@ -79,13 +79,15 @@ fn index_outdated_stats() { "avg_length", "mean", "sem", + "geometric_mean", + "harmonic_mean", "stddev", "variance", "cv", "nullcount", "max_precision", "sparsity", - "qsv__value", + "qsv__value" ], svec![ "letter", @@ -105,10 +107,12 @@ fn index_outdated_stats() { "", "", "", - "0", + "", "", "0", "", + "0", + "" ], svec![ "number", @@ -125,13 +129,15 @@ fn index_outdated_stats() { "1", "2", "0.4714", + "1.8171", + "1.6364", "0.8165", "0.6667", "40.8248", "0", "", "0", - "", + "" ], svec![ "qsv__rowcount", @@ -154,6 +160,8 @@ fn index_outdated_stats() { "", "", "", + "", + "", "3" ], svec![ @@ -177,6 +185,8 @@ fn index_outdated_stats() { "", "", "", + "", + "", "2" ], svec![ @@ -200,6 +210,8 @@ fn index_outdated_stats() { "", "", "", + "", + "", "26" ], svec![ @@ -223,7 +235,9 @@ fn index_outdated_stats() { "", "", "", - "597e467c8f605d260295bc4e059ffb683ac06139c701eda3bf6d5df7d6b1bc8f" + "", + "", + "09b55353162931d7a4617e04939bee06546049eae0b4b5969021ef02572a2193" ], ]; diff --git a/tests/test_stats.rs b/tests/test_stats.rs index ee5a65032..3616ef442 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -622,7 +622,8 @@ fn stats_prefer_dmy() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-stats.csv"); @@ -647,7 +648,8 @@ fn stats_prefer_mdy() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -672,7 +674,8 @@ fn stats_rounding() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-everything-8places-stats.csv"); @@ -715,7 +718,8 @@ fn stats_no_date_inference() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-everything-nodate-stats.csv"); @@ -741,7 +745,8 @@ fn stats_with_date_inference() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-everything-date-stats.csv"); @@ -763,7 +768,8 @@ fn stats_with_date_inference_default_whitelist() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = @@ -813,7 +819,8 @@ fn stats_with_date_type() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-everything-datenotime-stats.csv"); @@ -921,7 +928,8 @@ fn stats_cache() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-stats.csv"); @@ -959,7 +967,8 @@ fn stats_cache_negative_threshold() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-stats.csv"); @@ -996,7 +1005,8 @@ fn stats_cache_negative_threshold_unmet() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-stats.csv"); @@ -1036,7 +1046,8 @@ fn stats_cache_negative_threshold_five() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-stats.csv"); @@ -1106,7 +1117,8 @@ fn stats_is_ascii() { // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|stddev|sem|cv/").arg("in2.csv"); + cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); let expected2 = wrk.load_test_resource("boston311-100-with-nonascii-stats.csv"); @@ -1205,6 +1217,8 @@ fn stats_zero_cv() { "avg_length", "mean", "sem", + "geometric_mean", + "harmonic_mean", "stddev", "variance", "cv", @@ -1228,6 +1242,8 @@ fn stats_zero_cv() { "1", "3", "0.6325", + "2.6052", + "2.1898", "1.4142", "2", "47.1405", @@ -1251,6 +1267,8 @@ fn stats_zero_cv() { "1.8", "0", "3.1623", + "0", + "", "7.0711", "50", "", @@ -1274,6 +1292,8 @@ fn stats_zero_cv() { "5", "0", "28.8472", + "0", + "", "64.5043", "4160.801", "", @@ -1284,7 +1304,7 @@ fn stats_zero_cv() { ], svec![ "col4", "Integer", "", "935", "-900", "1000", "1900", "Unsorted", "1", "4", "14", - "2.8", "187", "304.3603", "680.5703", "463176", "363.9414", "0", "", "0", "" + "2.8", "187", "304.3603", "0", "", "680.5703", "463176", "363.9414", "0", "", "0", "" ], svec![ "qsv__rowcount", @@ -1307,6 +1327,8 @@ fn stats_zero_cv() { "", "", "", + "", + "", "5" ], svec![ @@ -1330,6 +1352,8 @@ fn stats_zero_cv() { "", "", "", + "", + "", "4" ], svec![ @@ -1353,6 +1377,8 @@ fn stats_zero_cv() { "", "", "", + "", + "", "93" ], svec![ @@ -1376,7 +1402,9 @@ fn stats_zero_cv() { "", "", "", - "1080eea697a7966a96fcfdcdaee4ae4d1355bce057cae6f27d8bba4684902ba1" + "", + "", + "228f039bafd53f7562c1418b74114a3a03f9c64e7be4c6965e67f2e7a3938267" ], ]; assert_eq!(got, expected); @@ -1406,14 +1434,14 @@ fn stats_output_tab_delimited() { wrk.assert_success(&mut cmd); let got = std::fs::read_to_string(out_file).unwrap(); - let expected = r#"field type is_ascii sum min max range sort_order min_length max_length sum_length avg_length mean sem stddev variance cv nullcount max_precision sparsity qsv__value -col1 Integer 15 1 5 4 Ascending 1 1 5 1 3 0.6325 1.4142 2 47.1405 0 0 -col2 Integer 10644 0 4321 4321 Descending 1 4 17 3.4 2128.8 685.6979 1533.267 2350907.76 72.0249 0 0 -col3 String true 01 10 Ascending 2 2 10 2 0 0 -qsv__rowcount 5 -qsv__columncount 3 -qsv__filesize_bytes 62 -qsv__fingerprint_hash b1d8236344b9e74711338567c4cc54a328cc803762aa2826ff00e9a1924ea407 + let expected = r#"field type is_ascii sum min max range sort_order min_length max_length sum_length avg_length mean sem geometric_mean harmonic_mean stddev variance cv nullcount max_precision sparsity qsv__value +col1 Integer 15 1 5 4 Ascending 1 1 5 1 3 0.6325 2.6052 2.1898 1.4142 2 47.1405 0 0 +col2 Integer 10644 0 4321 4321 Descending 1 4 17 3.4 2128.8 685.6979 0 1533.267 2350907.76 72.0249 0 0 +col3 String true 01 10 Ascending 2 2 10 2 0 0 +qsv__rowcount 5 +qsv__columncount 3 +qsv__filesize_bytes 62 +qsv__fingerprint_hash a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c "#; assert_eq!(got, expected); } @@ -1442,14 +1470,14 @@ fn stats_output_ssv_delimited() { wrk.assert_success(&mut cmd); let got = std::fs::read_to_string(out_file).unwrap(); - let expected = r#"field;type;is_ascii;sum;min;max;range;sort_order;min_length;max_length;sum_length;avg_length;mean;sem;stddev;variance;cv;nullcount;max_precision;sparsity;qsv__value -col1;Integer;;15;1;5;4;Ascending;1;1;5;1;3;0.6325;1.4142;2;47.1405;0;;0; -col2;Integer;;10644;0;4321;4321;Descending;1;4;17;3.4;2128.8;685.6979;1533.267;2350907.76;72.0249;0;;0; -col3;String;true;;01;10;;Ascending;2;2;10;2;;;;;;0;;0; -qsv__rowcount;;;;;;;;;;;;;;;;;;;;5 -qsv__columncount;;;;;;;;;;;;;;;;;;;;3 -qsv__filesize_bytes;;;;;;;;;;;;;;;;;;;;62 -qsv__fingerprint_hash;;;;;;;;;;;;;;;;;;;;b1d8236344b9e74711338567c4cc54a328cc803762aa2826ff00e9a1924ea407 + let expected = r#"field;type;is_ascii;sum;min;max;range;sort_order;min_length;max_length;sum_length;avg_length;mean;sem;geometric_mean;harmonic_mean;stddev;variance;cv;nullcount;max_precision;sparsity;qsv__value +col1;Integer;;15;1;5;4;Ascending;1;1;5;1;3;0.6325;2.6052;2.1898;1.4142;2;47.1405;0;;0; +col2;Integer;;10644;0;4321;4321;Descending;1;4;17;3.4;2128.8;685.6979;0;;1533.267;2350907.76;72.0249;0;;0; +col3;String;true;;01;10;;Ascending;2;2;10;2;;;;;;;;0;;0; +qsv__rowcount;;;;;;;;;;;;;;;;;;;;;;5 +qsv__columncount;;;;;;;;;;;;;;;;;;;;;;3 +qsv__filesize_bytes;;;;;;;;;;;;;;;;;;;;;;62 +qsv__fingerprint_hash;;;;;;;;;;;;;;;;;;;;;;a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c "#; assert_eq!(got, expected); } @@ -1481,14 +1509,14 @@ fn stats_output_csvsz_delimited() { cmd.arg("decompress").arg(out_file.clone()); let got: String = wrk.stdout(&mut cmd); - let expected = r#"field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,stddev,variance,cv,nullcount,max_precision,sparsity,qsv__value -col1,Integer,,15,1,5,4,Ascending,1,1,5,1,3,0.6325,1.4142,2,47.1405,0,,0, -col2,Integer,,10644,0,4321,4321,Descending,1,4,17,3.4,2128.8,685.6979,1533.267,2350907.76,72.0249,0,,0, -col3,String,true,,01,10,,Ascending,2,2,10,2,,,,,,0,,0, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,5 -qsv__columncount,,,,,,,,,,,,,,,,,,,,3 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,62 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,b1d8236344b9e74711338567c4cc54a328cc803762aa2826ff00e9a1924ea407"#; + let expected = r#"field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,qsv__value +col1,Integer,,15,1,5,4,Ascending,1,1,5,1,3,0.6325,2.6052,2.1898,1.4142,2,47.1405,0,,0, +col2,Integer,,10644,0,4321,4321,Descending,1,4,17,3.4,2128.8,685.6979,0,,1533.267,2350907.76,72.0249,0,,0, +col3,String,true,,01,10,,Ascending,2,2,10,2,,,,,,,,0,,0, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,5 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,3 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,62 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c"#; assert_eq!(got, expected); } From 02a4164b5c834834131a5075186c387dde72ba97 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:08:38 -0500 Subject: [PATCH 6/6] deps: bump qsv-stats from 0.24 to 0.25; also ensure polars is at 0.45 --- Cargo.lock | 152 ++++++++++++++++++++++++++++++----------------------- Cargo.toml | 8 +-- 2 files changed, 90 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 704e41a3c..7bb812d33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2254,9 +2254,9 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fiat-crypto" @@ -3417,9 +3417,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06d7a061b7feb8a4b233a4d90280d13e0965c4e0181566e9ad61af98e210ca9d" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -3430,9 +3430,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0029bdee2a94a6c4393a86f7e6921c90f234218fa4f2154bc001c92bc51e8bf5" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -3441,9 +3441,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "440a2398a08def518ff962b69e7146246c53bad8090e2b75d95fd5a469338958" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -3451,18 +3451,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3100209587e35b13881068ce5a41241b112e0500b4d847ba16be172829c112ff" +checksum = "ee72ef7886d94f30741743126c1ec123564749ee339281b9834d0e913f2d40fe" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27adf08e2f91ff44ab54bbac0c4579303f0865730870f91b58c044df821f114c" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", "lexical-write-integer", @@ -3471,9 +3471,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b4e5d27d742da13f013765f849efc0c4b6173e0e64404546475eb5ee0931e2c" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -4533,8 +4533,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0a108fffe97cb3a79aa87908ac88c56059cf2466e89ae304de163836a853a6" dependencies = [ "getrandom", "polars-arrow", @@ -4552,8 +4553,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c802be02fec184ca1873d933868c39d5ccd901018dab833c8ddf23749e4df278" dependencies = [ "ahash", "atoi", @@ -4596,8 +4598,9 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92735f5939f97b418954a08d927058fed633792f7bb5292ab73010475b0d8d03" dependencies = [ "atoi_simd", "bytemuck", @@ -4617,8 +4620,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c4562086cd2da911a62e8cb25cf10cbf840ac7d77d985ec8a26e7e8e49b684" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4646,28 +4650,30 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror 2.0.5", + "thiserror 2.0.6", "version_check", "xxhash-rust", ] [[package]] name = "polars-error" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1f394c2c21f1bd3e14b5dfb6a7ebe5f4a5c215c24f74f4e971ccbdce098da63" dependencies = [ "avro-schema", "object_store", "polars-arrow-format", "regex", "simdutf8", - "thiserror 2.0.5", + "thiserror 2.0.6", ] [[package]] name = "polars-expr" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37c7c250b97c42a5195f3da6534727c1b6158ceb0f526d25d02f183b42760e6c" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4689,8 +4695,9 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6d8662f5edb590528fb3bea10b4e7a8ca5387927f38b3a82ae3d772257e6c55" dependencies = [ "ahash", "async-trait", @@ -4737,8 +4744,9 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5858e97d6c1441fabed84dc2a1fa9bf9180f799712c4ee2ff790c825a0e31c3a" dependencies = [ "ahash", "chrono", @@ -4759,8 +4767,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2080ff9c31975ced0aab4424c557a503e108417342618d506a9120f24e83d83e" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4786,8 +4795,9 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce2697d14830103d912d7b0cb6dbf2244a5bb0e0760d1828a7bc5fba566a93cb" dependencies = [ "futures", "memmap2", @@ -4807,8 +4817,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24e56e34b484eeb19a486a723525bb0367b2a73ba1d361f1cbe922b0e526f22f" dependencies = [ "ahash", "argminmax", @@ -4842,8 +4853,9 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d683d41c377898b4e982a814e1bc1448b3dc01eedf6ea5a1322f68d56f9414" dependencies = [ "ahash", "async-stream", @@ -4880,8 +4892,9 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cf63c2f7988390c6dc895202241431b638d8edcbef36122a0412673ac66fef" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -4906,8 +4919,9 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c53fef25e54f0a603923730a8bf5dd65ed991a73bda7fd2f9c9e9ec4f3d71cf" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4941,8 +4955,9 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "087ac5ef877a56da4e9ef4280e08eb28f7916b2e1c578749798f03928c83ec09" dependencies = [ "bitflags 2.6.0", "bytemuck", @@ -4954,8 +4969,9 @@ dependencies = [ [[package]] name = "polars-schema" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21aa4bd255619a28b88478c5e683f9988e22a4dc99c62d137bd289479401c4f0" dependencies = [ "indexmap", "polars-error", @@ -4966,8 +4982,9 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a8067044d13ed9354a89d2af2aa6f7a9c2bcb1ad1a425cdedd8d492972477d" dependencies = [ "hex", "once_cell", @@ -4987,8 +5004,9 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1ea6090657846dc88833a9df5d33ae423e76746fd595ef89ef4169cd34a838" dependencies = [ "atomic-waker", "crossbeam-deque", @@ -5016,8 +5034,9 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0753b41e88e0161f1a1e38bfe744936ea377dd4bfe4f0b596e5b9a4d0994cc6" dependencies = [ "atoi", "bytemuck", @@ -5038,8 +5057,9 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a193495a7f88c149b0f5afa8e96a9b41376034a33cbd436a95e86b963cc3e81b" dependencies = [ "ahash", "bytemuck", @@ -5422,9 +5442,9 @@ dependencies = [ [[package]] name = "qsv-stats" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b50d07636d57c4de44fa144eb44e08dc27a8cbea5ec2a5f47e07196fb548dd4" +checksum = "83c5455540fba94d3c28525e8aea8b168c9cea3064682a8762fe3683373c2da8" dependencies = [ "ahash", "num-traits", @@ -5525,7 +5545,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror 2.0.5", + "thiserror 2.0.6", "tokio", "tracing", ] @@ -5544,7 +5564,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.5", + "thiserror 2.0.6", "tinyvec", "tracing", "web-time", @@ -6792,11 +6812,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643caef17e3128658ff44d85923ef2d28af81bb71e0d67bbfe1d76f19a73e053" +checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47" dependencies = [ - "thiserror-impl 2.0.5", + "thiserror-impl 2.0.6", ] [[package]] @@ -6812,9 +6832,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "995d0bbc9995d1f19d28b7215a9352b0fc3cd3a2d2ec95c2cadc485cdedbcdde" +checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312" dependencies = [ "proc-macro2", "quote", @@ -8273,7 +8293,7 @@ dependencies = [ "pbkdf2", "rand", "sha1", - "thiserror 2.0.5", + "thiserror 2.0.6", "time", "zeroize", "zopfli", diff --git a/Cargo.toml b/Cargo.toml index 5107aee8b..b8c8f3a10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -163,7 +163,7 @@ mlua = { version = "0.10", features = [ num_cpus = "1" odht = "0.3" phf = { version = "0.11", features = ["macros"], optional = true } -polars = { version = "0.44.2", features = [ +polars = { version = "0.45", features = [ "asof_join", "avro", "avx512", @@ -192,7 +192,7 @@ publicsuffix = { version = "2.2", optional = true } pyo3 = { version = "0.21.2", features = ["auto-initialize"], optional = true } qsv-dateparser = "0.12" qsv_docopt = "1.8" -qsv-stats = "0.24" +qsv-stats = "0.25" qsv_currency = "0.7" qsv-sniffer = { version = "0.10", default-features = false, features = [ "runtime-dispatch-simd", @@ -329,9 +329,9 @@ strum_macros = { git = "https://github.com/dathere/strum", branch = "bump-phf-to # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=a6ca94d +# QSV_POLARS_REV= # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.16.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "a6ca94d" } +# polars = { git = "https://github.com/pola-rs/polars", rev = "a6ca94d" } [features] default = ["mimalloc"]