From 5bb5eee14186073ac1fd545f2d27f2561d48536d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:19:54 -0400 Subject: [PATCH 1/5] `extdedup`: now support two modes - LINE mode and CSV mode --- src/cmd/extdedup.rs | 204 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 167 insertions(+), 37 deletions(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index 2e9ae300f..f96f872ba 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -7,8 +7,14 @@ to sort the CSV first before deduping it. This allows it to run in constant memory and the output will retain the input sort order. -Also, this command is not specific to CSV data, it deduplicates any text file on a -line-by-line basis. +This command has TWO modes of operation. + + * CSV MODE + when --select is set, it dedupes based on the given column/s. See `qsv select --help` + for select syntax details. + * LINE MODE + when --select is NOT set, it deduplicates any input text file (not just CSVs) on a + line-by-line basis. A duplicate count will be sent to . @@ -17,6 +23,10 @@ Usage: qsv extdedup --help extdedup options: + -s, --select Select a subset of columns to dedup. + Note that the outputs will remain at the full width of the CSV. + If --select is NOT set, extdedup will work in LINE MODE, sorting + the input as a text file on a line-by-line basis. --no-output Do not write deduplicated output to . Use this if you only want to know the duplicate count. -D, --dupes-output Write duplicates to . @@ -25,9 +35,20 @@ extdedup options: duplicate separated by a tab from the duplicate line itself. -H, --human-readable Comma separate duplicate count. --memory-limit The maximum amount of memory to buffer the on-disk hash table. - This is a percentage of total memory. [default: 10] + If less than 50, this is a percentage of total memory. + If more than 50, this is the memory in MB to allocate, capped + at 90 percent of total memory. + [default: 10] Common options: + CSV MODE ONLY: + -n, --no-headers When set, the first row will not be interpreted + as headers. That is, it will be sorted with the rest + of the rows. Otherwise, the first row will always + appear as the header row in the output. + -d, --delimiter The field delimiter for reading CSV data. + Must be a single character. (default: ,) + -h, --help Display this message -Q, --quiet Do not print duplicate count to stderr. "#; @@ -41,17 +62,25 @@ use indicatif::HumanCount; use serde::Deserialize; use sysinfo::System; -// use sysinfo::System::sysinfo; -use crate::{config, odhtcache, util, CliResult}; +use crate::{ + config, + config::{Config, Delimiter}, + odhtcache, + select::SelectColumns, + util, CliResult, +}; #[derive(Deserialize)] struct Args { arg_input: Option, + flag_select: Option, arg_output: Option, + flag_no_headers: bool, + flag_delimiter: Option, flag_no_output: bool, flag_dupes_output: Option, flag_human_readable: bool, - flag_memory_limit: Option, + flag_memory_limit: Option, flag_quiet: bool, } @@ -60,19 +89,101 @@ const MEMORY_LIMITED_BUFFER: u64 = 100 * 1_000_000; // 100 MB pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; - // memory buffer to use for on-disk hash table, - // if we can detect the total memory, use 10% of it by default - // and up to --memory-limit (capped at 50%), - // otherwise, if we cannot detect the free memory use a default of 100 MB - let mem_limited_buffer = if sysinfo::IS_SUPPORTED_SYSTEM { - let mut sys = System::new(); - sys.refresh_memory(); - (sys.total_memory() * 1000) / u8::min(args.flag_memory_limit.unwrap_or(10), 50) as u64 + // Set the memory buffer size for the on-disk hash table based on --memory-limit + // and system capabilities. + let mem_limited_buffer_bytes = calculate_memory_limit(args.flag_memory_limit); + log::info!("{mem_limited_buffer_bytes} bytes used for memory buffer for on-disk hash table..."); + + let quiet = args.flag_quiet; + let human_readable = args.flag_human_readable; + + let dupes_count = if args.flag_select.is_some() { + dedup_csv(args, mem_limited_buffer_bytes)? } else { - MEMORY_LIMITED_BUFFER + dedup_lines(args, mem_limited_buffer_bytes)? }; - log::info!("{mem_limited_buffer} bytes used for memory buffer for on-disk hash table..."); + if quiet { + return Ok(()); + } + + eprintln!( + "{}", + if human_readable { + HumanCount(dupes_count).to_string() + } else { + dupes_count.to_string() + } + ); + + Ok(()) +} + +fn dedup_csv(args: Args, mem_limited_buffer: u64) -> Result { + let rconfig = Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_select.unwrap()); + + let mut rdr = rconfig.reader()?; + let mut wtr = Config::new(args.arg_output.as_ref()).writer()?; + let dupes_output = args.flag_dupes_output.is_some(); + let mut dupewtr = Config::new(args.flag_dupes_output.as_ref()).writer()?; + + let headers = rdr.byte_headers()?.clone(); + if dupes_output { + let mut dupe_headers = csv::ByteRecord::new(); + dupe_headers.push_field(b"dupe_rowno"); + dupe_headers.extend(headers.iter()); + dupewtr.write_byte_record(&dupe_headers)?; + } + + let mut dedup_cache = odhtcache::ExtDedupCache::new(mem_limited_buffer); + let mut dupes_count = 0_u64; + let sel = rconfig.selection(&headers)?; + + rconfig.write_headers(&mut rdr, &mut wtr)?; + + // Pre-allocate and reuse buffers + let mut key = String::with_capacity(20); + let mut utf8_string = String::with_capacity(20); + let mut dupe_row = csv::ByteRecord::new(); + let mut curr_row = csv::ByteRecord::new(); + + for (row_idx, row) in rdr.byte_records().enumerate() { + curr_row.clone_from(&row?); + key.clear(); + for field in sel.select(&curr_row) { + if let Ok(s_utf8) = simdutf8::basic::from_utf8(field) { + key.push_str(s_utf8); + } else { + utf8_string.clear(); + utf8_string.push_str(&String::from_utf8_lossy(field)); + key.push_str(&utf8_string); + } + } + + if dedup_cache.contains(&key) { + dupes_count += 1; + if dupes_output { + dupe_row.clear(); + dupe_row.push_field((row_idx + 1).to_string().as_bytes()); + dupe_row.extend(curr_row.iter()); + dupewtr.write_byte_record(&dupe_row)?; + } + } else { + dedup_cache.insert(&key); + wtr.write_byte_record(&curr_row)?; + } + } + + dupewtr.flush()?; + wtr.flush()?; + + Ok(dupes_count) +} + +fn dedup_lines(args: Args, mem_limited_buffer: u64) -> Result { let input_reader: Box = match &args.arg_input { Some(input_path) => { if input_path.to_lowercase().ends_with(".sz") { @@ -88,7 +199,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }, None => Box::new(io::BufReader::new(stdin().lock())), }; - let mut output_writer: Box = match &args.arg_output { Some(output_path) => Box::new(io::BufWriter::with_capacity( config::DEFAULT_WTR_BUFFER_CAPACITY, @@ -99,9 +209,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { stdout().lock(), )), }; - let mut write_dupes = false; - #[cfg(target_family = "unix")] let mut dupes_writer = if let Some(dupes_output) = args.flag_dupes_output { write_dupes = true; @@ -115,7 +223,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { fs::File::create("/dev/null")?, ) }; - #[cfg(target_family = "windows")] let mut dupes_writer = if let Some(dupes_output) = args.flag_dupes_output { write_dupes = true; @@ -129,44 +236,67 @@ pub fn run(argv: &[&str]) -> CliResult<()> { fs::File::create("nul")?, ) }; - let mut dedup_cache = odhtcache::ExtDedupCache::new(mem_limited_buffer); - let mut dupes_count = 0_u64; - let mut line_work = String::with_capacity(100); + let mut line_work = String::with_capacity(1024); for (row_idx, line) in input_reader.lines().enumerate() { line_work.clone_from(&line?); if dedup_cache.contains(&line_work) { dupes_count += 1; if write_dupes { - dupes_writer.write_all(format!("{row_idx}\t{line_work}\n").as_bytes())?; + writeln!(dupes_writer, "{row_idx}\t{line_work}")?; } } else { dedup_cache.insert(&line_work); if args.flag_no_output { continue; } - output_writer.write_all(format!("{line_work}\n").as_bytes())?; + writeln!(output_writer, "{line_work}")?; } } - dupes_writer.flush()?; output_writer.flush()?; - if args.flag_quiet { - return Ok(()); + Ok(dupes_count) +} + +/// Determines the memory buffer size to use for on-disk hash table based on +/// the provided flag and the system's total memory. +/// +/// # Arguments +/// +/// * `flag_memory_limit` - An optional u64 value representing the user-specified memory limit. +/// +/// # Returns +/// +/// A u64 value representing the calculated memory limit in bytes. +/// +/// # Behavior +/// +/// - If the system is not supported, it returns a predefined `MEMORY_LIMITED_BUFFER` value. +/// - If `flag_memory_limit` is None, it returns the `MEMORY_LIMITED_BUFFER`. +/// - If `flag_memory_limit` is Some(limit): +/// - For limit <= 50, it's treated as a percentage of total system memory. +/// - For limit > 50, it's treated as megabytes, but capped at 90% of total system memory. +fn calculate_memory_limit(flag_memory_limit: Option) -> u64 { + if !sysinfo::IS_SUPPORTED_SYSTEM { + return MEMORY_LIMITED_BUFFER; } - eprintln!( - "{}", - if args.flag_human_readable { - HumanCount(dupes_count).to_string() - } else { - dupes_count.to_string() - } - ); + let mut sys = System::new(); + sys.refresh_memory(); + let total_memory = sys.total_memory(); - Ok(()) + #[allow(clippy::cast_precision_loss)] + match flag_memory_limit { + Some(limit) if limit <= 50 => ((total_memory as f64 * limit as f64) / 100.0) as u64, + Some(limit) => { + let limit_bytes = limit.saturating_mul(1_000_000); // Convert MB to bytes + let ninety_percent_total = (total_memory as f64 * 0.9) as u64; + std::cmp::min(limit_bytes, ninety_percent_total) + }, + None => MEMORY_LIMITED_BUFFER, + } } #[test] From f26d9dbe9df210e7644e2e65a3937dfd4688f3de Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:20:49 -0400 Subject: [PATCH 2/5] `tests`: add additional tests for `extdedup` csv mode --- tests/test_extdedup.rs | 102 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/tests/test_extdedup.rs b/tests/test_extdedup.rs index 3ab70490b..b447ecdd1 100644 --- a/tests/test_extdedup.rs +++ b/tests/test_extdedup.rs @@ -3,8 +3,8 @@ use newline_converter::dos2unix; use crate::workdir::Workdir; #[test] -fn extdedup() { - let wrk = Workdir::new("extdedup").flexible(true); +fn extdedup_linemode() { + let wrk = Workdir::new("extdedup_linemode").flexible(true); wrk.clear_contents().unwrap(); let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); @@ -23,7 +23,7 @@ fn extdedup() { } #[test] -fn extdedup_dupesoutput() { +fn extdedup_linemode_dupesoutput() { let wrk = Workdir::new("extdedup-dupes-output").flexible(true); wrk.clear_contents().unwrap(); @@ -54,3 +54,99 @@ fn extdedup_dupesoutput() { assert_eq!(dos2unix(&dupes_output), dos2unix(&expected_output)); } + +#[test] +fn extdedupe_csvmode() { + let wrk = Workdir::new("extdedup-csvmode").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args(["--select", "case_enquiry_id,open_dt,target_dt"]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-100-deduped.csv"); + wrk.create_from_string("boston311-100-deduped.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + + // 20 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("20\n")); +} + +#[test] +fn extdedupe_csvmode_dupesoutput() { + let wrk = Workdir::new("extdedup-csvmode-dupesoutput").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args([ + "--select", + "case_enquiry_id,open_dt,target_dt", + "--dupes-output", + "boston311-100-extdededuped-dupeoutput.csv", + ]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-100-deduped.csv"); + wrk.create_from_string("boston311-100-deduped.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // load dupe-output txt + let dupes_output: String = wrk.from_str(&wrk.path("boston311-100-extdededuped-dupeoutput.csv")); + + let expected_output = wrk.load_test_resource("boston311-extdedup-dupeoutput.csv"); + wrk.create_from_string("boston311-extdedup-dupeoutput.csv", &expected_output); + + assert_eq!(dos2unix(&dupes_output), dos2unix(&expected_output)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + // 20 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("20\n")); +} + +#[test] +fn extdedupe_csvmode_neighborhood() { + let wrk = Workdir::new("extdedup-csvmode-neighborhood").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args(["--select", "neighborhood"]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-extdedup-neighborhood.csv"); + wrk.create_from_string("boston311-extdedup-neighborhood.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + + // 81 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("81\n")); +} From eeefdee6bcb97ccdf44b4644f5636e65a3566747 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:24:26 -0400 Subject: [PATCH 3/5] `tests`: add extdedup csv mode test csvs --- .../test/boston311-extdedup-dupeoutput.csv | 21 +++++++++++++++++++ .../test/boston311-extdedup-neighborhood.csv | 20 ++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 resources/test/boston311-extdedup-dupeoutput.csv create mode 100644 resources/test/boston311-extdedup-neighborhood.csv diff --git a/resources/test/boston311-extdedup-dupeoutput.csv b/resources/test/boston311-extdedup-dupeoutput.csv new file mode 100644 index 000000000..11b47dd80 --- /dev/null +++ b/resources/test/boston311-extdedup-dupeoutput.csv @@ -0,0 +1,21 @@ +dupe_rowno,case_enquiry_id,open_dt,target_dt,closed_dt,ontime,case_status,closure_reason,case_title,subject,reason,type,queue,department,submittedphoto,closedphoto,location,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_street_name,location_zipcode,latitude,longitude,source +40,101004154423,2022-01-31 08:05:00,,,ONTIME,Open, ,Sidewalk Cover / Manhole,Boston Water & Sewer Commission,Sidewalk Cover / Manhole,Sidewalk Cover / Manhole,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,8 Putnam St Charlestown MA 02129,3,1A,1,A15,Charlestown,2,Ward 2,0201,8 Putnam St,02129,42.3735,-71.0599,Constituent Call +46,101004114154,2022-01-02 16:20:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,PWDx_Graffiti,PWDx,,,600 Atlantic Ave Boston MA 02210,3,1C,2,A1,Downtown / Financial District,3,Ward 3,0306,600 Atlantic Ave,02210,42.3527,-71.0536,Citizens Connect App +51,101004114795,2022-01-03 12:29:00,2022-03-07 12:29:41,,OVERDUE,Open, ,Graffiti: Ward 8 0803 ,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP,,,2000A Washington St Roxbury MA 02118,7,10B,7,D4,Roxbury,13,Ward 8,0803,2000A Washington St,02118,42.3333,-71.0797,Constituent Call +57,101004114016,2022-01-02 13:22:10,2022-01-04 08:30:00,2022-01-02 20:24:18,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 20:24:18 EST 2022 Resolved Has been cleaned up ,Requests for Street Cleaning,Public Works Department,Street Cleaning,Requests for Street Cleaning,PWDx_District 03: North Dorchester,PWDx,https://311.boston.gov/media/boston/report/photos/61d1ed4105bbcf180c2a2d66/report.jpg,,71 Willow Ct Dorchester MA 02125,6,03,2,C6,Dorchester,5,07,0708,71 Willow Ct,02125,42.3246,-71.0636,Citizens Connect App +58,101004113811,2022-01-02 08:01:29,2022-01-04 08:30:00,2022-01-03 05:59:50,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 05:59:50 EST 2022 Resolved ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 10A: Roxbury,PWDx,,,INTERSECTION of Sunnyside St & Centre St Jamaica Plain MA ,9,10A,6,E13,Jamaica Plain,11,10,1009,INTERSECTION Sunnyside St & Centre St,,42.3594,-71.0587,City Worker App +59,101004113906,2022-01-02 10:32:35,2022-01-03 10:32:34,2022-01-03 06:44:23,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 06:44:23.4 Duplicate of Existing Case ,Traffic Signal Inspection,Transportation - Traffic Division,Signs & Signals,Traffic Signal Inspection,BTDT_Traffic Signal_Repair,BTDT,https://311.boston.gov/media/boston/report/photos/61d1c58205bbcf180c2a1816/report.jpg,,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,8,07,4,B3,Dorchester,7,17,1704,INTERSECTION Gallivan Blvd & Washington St,,42.3594,-71.0587,Citizens Connect App +62,101004114033,2022-01-02 13:38:41,2022-01-05 08:30:00,2022-01-03 07:08:35,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 07:08:35 EST 2022 Resolved No violation found at this time today is trash day. ,Improper Storage of Trash (Barrels),Public Works Department,Code Enforcement,Improper Storage of Trash (Barrels),PWDx_Code Enforcement,PWDx,https://311.boston.gov/media/boston/report/photos/61d1f12405bbcf180c2a3082/report.jpg,,INTERSECTION of Lewis St & North St Boston MA ,3,1B,1,A1,Downtown / Financial District,3,3,,INTERSECTION Lewis St & North St,,42.3594,-71.0587,Citizens Connect App +65,101004113637,2022-01-01 17:24:56,2022-01-04 08:30:00,2022-01-03 00:03:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 00:03:27.62 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,353-361 Athens St South Boston MA 02127,6,05,2,C6,South Boston / South Boston Waterfront,5,Ward 6,0604,353-361 Athens St,02127,42.3369,-71.0471,Citizens Connect App +66,101004114724,2022-01-03 11:36:21,,2022-01-04 16:31:31,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 16:31:31.297 Bulk Item Automation ,Schedule Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup SS,PWDx_Schedule a Bulk Item Pickup,PWDx,,,352 Riverway Boston MA 02115,4,10A,8,B2,Mission Hill,14,Ward 10,1004,352 Riverway,02115,42.3335,-71.1113,Self Service +71,101004113512,2022-01-01 12:43:50,2022-01-31 12:43:50,2022-01-03 10:46:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:46:27.983 Case Noted BTD will investigate. Thank you for contacting 311 and BTD. ,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,New Sign Crosswalk or Pavement Marking,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT,,,43 Oakview Ter Jamaica Plain MA 02130,9,02,6,E13,Jamaica Plain,11,Ward 19,1901,43 Oakview Ter,02130,42.3188,-71.1092,Self Service +79,101004114807,2022-01-03 12:35:00,,2022-01-10 16:30:33,ONTIME,Closed,Case Closed. Closed date : 2022-01-10 16:30:33.11 Bulk Item Automation ,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,,,21 Ellington St Dorchester MA 02121,7,03,4,B3,Greater Mattapan,13,Ward 14,1403,21 Ellington St,02121,42.3021,-71.0844,Constituent Call +83,101004113526,2022-01-01 13:14:52,2022-01-04 08:30:00,2022-01-02 06:43:42,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 06:43:42 EST 2022 Resolved Trash removed ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 10B: Roxbury,PWDx,,,16 Circuit St Roxbury MA 02119,7,10B,7,B2,Roxbury,13,Ward 12,1203,16 Circuit St,02119,42.3235,-71.0852,City Worker App +87,101004114108,2022-01-02 15:00:52,2022-01-04 08:30:00,2022-01-02 23:40:14,ONTIME,Closed,Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,https://311.boston.gov/media/boston/report/photos/61d2046805bbcf180c2a418d/report.jpg,,INTERSECTION of Nassau St & Washington St Boston MA ,4,1C,2,A1,Downtown / Financial District,4,3,0308,INTERSECTION Nassau St & Washington St,,42.3594,-71.0587,Citizens Connect App +88,101004114783,2022-01-03 12:19:00,2022-01-04 12:19:43,2022-01-03 14:05:26,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 14:05:26.86 Case Resolved Area ticketed ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,32 Mount Vernon St Dorchester MA 02125,6,03,2,C6,Dorchester,5,Ward 7,0709,32 Mount Vernon St,02125,42.322,-71.0573,Constituent Call +89,101004113721,2022-01-01 21:31:54,2022-01-31 21:31:54,2022-01-04 08:34:40,ONTIME,Closed,Case Closed. Closed date : Tue Jan 04 08:34:40 EST 2022 Noted Investigating area will continue monitoring. ,Rodent Activity,Inspectional Services,Environmental Services,Rodent Activity,ISD_Environmental Services (INTERNAL),ISD,,,INTERSECTION of Asticou Rd & Washington St Jamaica Plain MA ,12,02,6,E13,Jamaica Plain,11,19,1110,INTERSECTION Asticou Rd & Washington St,,42.3594,-71.0587,Citizens Connect App +91,101004113654,2022-01-01 18:07:52,2022-01-04 08:30:00,2022-01-01 19:07:41,ONTIME,Closed,Case Closed. Closed date : Sat Jan 01 19:07:41 EST 2022 Resolved Belly emptied ,Empty Litter Basket,Public Works Department,Highway Maintenance,Empty Litter Basket,PWDx_District 1B: North End,PWDx,https://311.boston.gov/media/boston/report/photos/61d0debd05bbcf180c29b2c6/report.jpg,,INTERSECTION of Prince St & Causeway St Boston MA ,3,1B,1,A1,Downtown / Financial District,3,3,0302,INTERSECTION Prince St & Causeway St,,42.3594,-71.0587,Citizens Connect App +94,101004113386,2022-01-01 09:23:39,2022-01-10 08:30:00,2022-01-01 12:56:14,ONTIME,Closed,Case Closed. Closed date : Sat Jan 01 12:56:14 EST 2022 Noted Don't believe this is a city park ,Litter / Ground Maintenance - Wellington Green (BPRD),Parks & Recreation Department,Park Maintenance & Safety,Ground Maintenance,PARK_Maintenance_Ground Maintenance,PARK,https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg,,563 Columbus Ave Roxbury MA 02118,4,1C,7,D4,South End,6,Ward 4,0404,563 Columbus Ave,02118,42.3412,-71.0815,Citizens Connect App +95,101004114021,2022-01-02 13:26:36,2022-01-04 08:30:00,2022-01-02 14:49:17,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 14:49:17 EST 2022 Resolved Dead rat picked up ,Pick up Dead Animal,Public Works Department,Street Cleaning,Pick up Dead Animal,PWDx_District 1B: North End,PWDx,https://311.boston.gov/media/boston/report/photos/61d1ee4b05bbcf180c2a2daf/report.jpg,,23 Charter St Boston MA 02113,3,1B,1,A1,Downtown / Financial District,3,Ward 3,0302,23 Charter St,02113,42.3668,-71.0535,Citizens Connect App +99,101004113902,2022-01-02 10:27:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,BTDT_BostonBikes,BTDT,https://311.boston.gov/media/boston/report/photos/61d1c45805bbcf180c2a17ee/report.jpg,,201 Massachusetts Ave Boston MA 02115,4,10A,7,D4,Back Bay,14,04,0405,201 Massachusetts Ave,02115,42.3452,-71.0871,Citizens Connect App +100,101004115118,2022-01-03 16:16:00,2022-01-19 16:16:48,2022-02-28 10:40:30,OVERDUE,Closed,Case Closed. Closed date : 2022-02-28 10:40:30.233 Case Noted Please resubmit with color make and plate number ,Abandoned Vehicles,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Abandoned Vehicles,BTDT_AVRS Interface Queue,BTDT,https://311.boston.gov/media/boston/report/photos/61d367b405bbcf180c2b1f49/report.jpg,,183 Orleans St East Boston MA 02128,1,09,1,A7,East Boston,1,01,0102,183 Orleans St,02128,42.3715,-71.034,Citizens Connect App diff --git a/resources/test/boston311-extdedup-neighborhood.csv b/resources/test/boston311-extdedup-neighborhood.csv new file mode 100644 index 000000000..1406e7051 --- /dev/null +++ b/resources/test/boston311-extdedup-neighborhood.csv @@ -0,0 +1,20 @@ +case_enquiry_id,open_dt,target_dt,closed_dt,ontime,case_status,closure_reason,case_title,subject,reason,type,queue,department,submittedphoto,closedphoto,location,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_street_name,location_zipcode,latitude,longitude,source +101004113637,2022-01-01 17:24:56,2022-01-04 08:30:00,2022-01-03 00:03:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 00:03:27.62 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,353-361 Athens St South Boston MA 02127,6,05,2,C6,South Boston / South Boston Waterfront,5,Ward 6,0604,353-361 Athens St,02127,42.3369,-71.0471,Citizens Connect App +101004114795,2022-01-03 12:29:00,2022-03-07 12:29:41,,OVERDUE,Open, ,Graffiti: Ward 8 0803 ,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP,,,2000A Washington St Roxbury MA 02118,7,10B,7,D4,Roxbury,13,Ward 8,0803,2000A Washington St,02118,42.3333,-71.0797,Constituent Call +101004114783,2022-01-03 12:19:00,2022-01-04 12:19:43,2022-01-03 14:05:26,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 14:05:26.86 Case Resolved Area ticketed ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,32 Mount Vernon St Dorchester MA 02125,6,03,2,C6,Dorchester,5,Ward 7,0709,32 Mount Vernon St,02125,42.322,-71.0573,Constituent Call +101004141367,2022-01-20 08:15:45,2022-01-21 08:30:00,2022-01-20 08:45:12,ONTIME,Closed,Case Closed. Closed date : Thu Jan 20 08:45:12 EST 2022 Noted ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 1B: North End,PWDx,,,12 Derne St Boston MA 02114,3,1B,1,A1,Beacon Hill,3,Ward 3,0306,12 Derne St,02114,42.3596,-71.0634,City Worker App +101004114154,2022-01-02 16:20:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,PWDx_Graffiti,PWDx,,,600 Atlantic Ave Boston MA 02210,3,1C,2,A1,Downtown / Financial District,3,Ward 3,0306,600 Atlantic Ave,02210,42.3527,-71.0536,Citizens Connect App +101004113822,2022-01-02 08:15:00,2022-01-10 08:30:00,2022-01-05 10:37:03,ONTIME,Closed,Case Closed. Closed date : Wed Jan 05 10:37:03 EST 2022 Resolved ,Electrical,Inspectional Services,Building,Electrical,ISD_Building (INTERNAL),ISD,,,156 Everett St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0101,156 Everett St,02128,42.3666,-71.0323,Constituent Call +101004113313,2022-01-01 01:56:00,,,ONTIME,Open, ,Loud Parties/Music/People,Boston Police Department,Noise Disturbance,Loud Parties/Music/People,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,755 Boylston St Boston MA 02116,4,1C,8,D4,Back Bay,14,Ward 5,0508,755 Boylston St,02116,42.3494,-71.0811,Constituent Call +101004114624,2022-01-03 10:12:00,2022-05-03 10:12:36,2022-01-13 14:12:46,ONTIME,Closed,Case Closed. Closed date : Thu Jan 13 14:12:46 EST 2022 Noted Violations found. Notice written. ,SCHEDULED Pest Infestation - Residential,Inspectional Services,Housing,Pest Infestation - Residential,ISD_Housing (INTERNAL),ISD,,,20 Washington St Brighton MA 02135,11,04,9,D14,Allston / Brighton,15,Ward 21,2112,20 Washington St,02135,42.3425,-71.1412,Constituent Call +101004114608,2022-01-03 10:02:57,2022-05-03 10:02:57,2022-01-03 10:11:24,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:11:24.96 Case Invalid ,Pest Infestation - Residential,Inspectional Services,Housing,Pest Infestation - Residential,ISD_Housing (INTERNAL),ISD,,,20 Washington St Hyde Park MA 02136,12,08,5,E18,Hyde Park,10,18,1817,20 Washington St,02136,42.3594,-71.0587,Constituent Call +101004120108,2022-01-08 12:54:49,2022-01-11 08:30:00,2022-01-09 06:43:06,ONTIME,Closed,Case Closed. Closed date : Sun Jan 09 06:43:06 EST 2022 Noted ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 1C: Downtown,PWDx,,,198 W Springfield St Roxbury MA 02118,4,1C,7,D4,South End,6,Ward 9,0902,198 W Springfield St,02118,42.3401,-71.0803,City Worker App +101004113512,2022-01-01 12:43:50,2022-01-31 12:43:50,2022-01-03 10:46:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:46:27.983 Case Noted BTD will investigate. Thank you for contacting 311 and BTD. ,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,New Sign Crosswalk or Pavement Marking,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT,,,43 Oakview Ter Jamaica Plain MA 02130,9,02,6,E13,Jamaica Plain,11,Ward 19,1901,43 Oakview Ter,02130,42.3188,-71.1092,Self Service +101004114807,2022-01-03 12:35:00,,2022-01-10 16:30:33,ONTIME,Closed,Case Closed. Closed date : 2022-01-10 16:30:33.11 Bulk Item Automation ,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,,,21 Ellington St Dorchester MA 02121,7,03,4,B3,Greater Mattapan,13,Ward 14,1403,21 Ellington St,02121,42.3021,-71.0844,Constituent Call +101004113747,2022-01-01 23:46:09,2022-01-17 08:30:00,2022-01-02 11:03:10,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 11:03:10 EST 2022 Noted Case noted. Duplicate case. Posts already marked for contractor to repair. ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,https://311.boston.gov/media/boston/report/photos/61d12e0705bbcf180c29cfc2/report.jpg,,103 N Beacon St Brighton MA 02135,11,04,9,D14,Brighton,15,22,2205,103 N Beacon St,02135,42.3549,-71.143,Citizens Connect App +101004154423,2022-01-31 08:05:00,,,ONTIME,Open, ,Sidewalk Cover / Manhole,Boston Water & Sewer Commission,Sidewalk Cover / Manhole,Sidewalk Cover / Manhole,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,8 Putnam St Charlestown MA 02129,3,1A,1,A15,Charlestown,2,Ward 2,0201,8 Putnam St,02129,42.3735,-71.0599,Constituent Call +101004115093,2022-01-03 16:06:33,,2022-01-04 08:15:58,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 08:15:58.1 Case Invalid This case has been closed as there is not enough information to process this request. If you feel this has been closed in error please dial 311 to submit a new request. Sincerely Boston 311 Team ,City/State Snow Issues,Mayor's 24 Hour Hotline,Programs,City/State Snow Issues,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,40 Battery St Boston MA 02109,3,1B,1,A1,Boston,3,03,0301,40 Battery St,02109,42.3594,-71.0587,Constituent Call +101004114724,2022-01-03 11:36:21,,2022-01-04 16:31:31,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 16:31:31.297 Bulk Item Automation ,Schedule Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup SS,PWDx_Schedule a Bulk Item Pickup,PWDx,,,352 Riverway Boston MA 02115,4,10A,8,B2,Mission Hill,14,Ward 10,1004,352 Riverway,02115,42.3335,-71.1113,Self Service +101004114391,2022-01-03 08:00:00,2022-01-04 08:30:00,2022-01-03 08:36:14,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 08:36:14 EST 2022 Resolved Been removed ,Requests for Street Cleaning,Public Works Department,Street Cleaning,Requests for Street Cleaning,PWDx_District 06: West Roxbury and Roslindale,PWDx,,,2432 Centre St West Roxbury MA 02132,12,06,6,E5,West Roxbury,12,Ward 20,2015,2432 Centre St,02132,42.2674,-71.1626,Constituent Call +101004114656,2022-01-03 10:43:00,2022-01-24 10:43:43,2022-01-18 08:00:19,ONTIME,Closed,Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Request for Recycling Cart,Public Works Department,Recycling,Request for Recycling Cart,PWDx_Recycling Sent to Contractor,PWDx,,,49 Westbourne St Roslindale MA 02131,12,06,5,E5,Roslindale,10,Ward 20,2009,49 Westbourne St,02131,42.2821,-71.1415,Constituent Call +101004143000,2022-01-21 13:47:00,2022-02-04 13:47:30,,OVERDUE,Open, ,BTDT: Complaint,Mayor's 24 Hour Hotline,Employee & General Comments,General Comments For a Program or Policy,BTDT_Parking Enforcement,BTDT,,, , , , , , , , , ,,,42.3594,-71.0587,Constituent Call From fc1e756f0a685f8b3ccd79b48a5dcbe60da080d6 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:43:13 -0400 Subject: [PATCH 4/5] `extdedup`: use itoa for faster integer to string conversion --- src/cmd/extdedup.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index f96f872ba..f0b1abe5f 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -167,7 +167,7 @@ fn dedup_csv(args: Args, mem_limited_buffer: u64) -> Result Date: Sat, 12 Oct 2024 20:22:00 -0400 Subject: [PATCH 5/5] `docs`: `extdedup` has selector support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ce8b7577..573d23ac5 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ | [excel](/src/cmd/excel.rs#L2)
🚀 | Exports a specified Excel/ODS sheet to a CSV file. | | [exclude](/src/cmd/exclude.rs#L2)
📇👆 | Removes a set of CSV data from another set based on the specified columns. | | [explode](/src/cmd/explode.rs#L2)
🔣👆 | Explode rows into multiple ones by splitting a column value based on the given separator. | -| [extdedup](/src/cmd/extdedup.rs#L2)
| Remove duplicate rows from an arbitrarily large CSV/text file using a memory-mapped, [on-disk hash table](https://crates.io/crates/odht). Unlike the `dedup` command, this command does not load the entire file into memory nor does it sort the deduped file. | +| [extdedup](/src/cmd/extdedup.rs#L2)
👆 | Remove duplicate rows from an arbitrarily large CSV/text file using a memory-mapped, [on-disk hash table](https://crates.io/crates/odht). Unlike the `dedup` command, this command does not load the entire file into memory nor does it sort the deduped file. | | [extsort](/src/cmd/extsort.rs#L2)
🚀 | Sort an arbitrarily large CSV/text file using a multithreaded [external merge sort](https://en.wikipedia.org/wiki/External_sorting) algorithm. | | [fetch](/src/cmd/fetch.rs#L3)
✨🧠🌐 | Fetches data from web services for every row using **HTTP Get**. Comes with [HTTP/2](https://http2-explained.haxx.se/en/part1) [adaptive flow control](https://medium.com/coderscorner/http-2-flow-control-77e54f7fd518), [jql](https://github.com/yamafaktory/jql#%EF%B8%8F-usage) JSON query language support, dynamic throttling ([RateLimit](https://www.ietf.org/archive/id/draft-ietf-httpapi-ratelimit-headers-06.html)) & caching with available persistent caching using [Redis](https://redis.io/) or a disk-cache. | | [fetchpost](/src/cmd/fetchpost.rs#L3)
✨🧠🌐 | Similar to `fetch`, but uses **HTTP Post**. ([HTTP GET vs POST methods](https://www.geeksforgeeks.org/difference-between-http-get-and-post-methods/)) |