diff --git a/Cargo.lock b/Cargo.lock
index e8714c13..f7207b6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,3 +1,5 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
 [[package]]
 name = "aho-corasick"
 version = "0.6.6"
diff --git a/src/cmd/exclude.rs b/src/cmd/exclude.rs
new file mode 100644
index 00000000..3868e9c8
--- /dev/null
+++ b/src/cmd/exclude.rs
@@ -0,0 +1,267 @@
+use std::collections::hash_map::{Entry, HashMap};
+use std::fmt;
+use std::fs;
+use std::io;
+use std::str;
+
+use byteorder::{BigEndian, WriteBytesExt};
+use csv;
+
+use config::{Config, Delimiter};
+use index::Indexed;
+use select::{SelectColumns, Selection};
+use util;
+use CliResult;
+
+static USAGE: &'static str = "
+Removes a set of CSV data from another set based on the specified columns.
+
+Also can compute the intersection of two CSV sets with the -v flag.
+
+Matching is always done by ignoring leading and trailing whitespace. By default,
+matching is done case sensitively, but this can be disabled with the --no-case
+flag.
+
+The columns arguments specify the columns to match for each input. Columns can
+be referenced by name or index, starting at 1. Specify multiple columns by
+separating them with a comma. Specify a range of columns with `-`. Both
+columns1 and columns2 must specify exactly the same number of columns.
+(See 'xsv select --help' for the full syntax.)
+
+Usage:
+    xsv exclude [options] <columns1> <input1> <columns2> <input2>
+    xsv exclude --help
+
+join options:
+    --no-case              When set, matching is done case insensitively.
+    -v                     When set, matching rows will be the only ones included,
+                           forming set intersection, instead of the ones discarded.
+
+Common options:
+    -h, --help             Display this message
+    -o, --output <file>    Write output to <file> instead of stdout.
+    -n, --no-headers       When set, the first row will not be interpreted
+                           as headers. (i.e., They are not searched, analyzed,
+                           sliced, etc.)
+    -d, --delimiter <arg>  The field delimiter for reading CSV data.
+                           Must be a single character. (default: ,)
+";
+
+type ByteString = Vec<u8>;
+
+#[derive(Deserialize)]
+struct Args {
+    arg_columns1: SelectColumns,
+    arg_input1: String,
+    arg_columns2: SelectColumns,
+    arg_input2: String,
+    flag_v: bool,
+    flag_output: Option<String>,
+    flag_no_headers: bool,
+    flag_no_case: bool,
+    flag_delimiter: Option<Delimiter>,
+}
+
+pub fn run(argv: &[&str]) -> CliResult<()> {
+    let args: Args = util::get_args(USAGE, argv)?;
+    let mut state = args.new_io_state()?;
+    state.write_headers()?;
+    state.exclude(args.flag_v)
+}
+
+struct IoState<R, W: io::Write> {
+    wtr: csv::Writer<W>,
+    rdr1: csv::Reader<R>,
+    sel1: Selection,
+    rdr2: csv::Reader<R>,
+    sel2: Selection,
+    no_headers: bool,
+    casei: bool,
+}
+
+impl<R: io::Read + io::Seek, W: io::Write> IoState<R, W> {
+    fn write_headers(&mut self) -> CliResult<()> {
+        if !self.no_headers {
+            let mut headers = self.rdr1.byte_headers()?.clone();
+            self.wtr.write_record(&headers)?;
+        }
+        Ok(())
+    }
+
+    fn exclude(mut self, invert: bool) -> CliResult<()> {
+        let mut scratch = csv::ByteRecord::new();
+        let mut validx = ValueIndex::new(self.rdr2, &self.sel2, self.casei)?;
+        for row in self.rdr1.byte_records() {
+            let row = row?;
+            let key = get_row_key(&self.sel1, &row, self.casei);
+            match validx.values.get(&key) {
+                None => {
+                    if !invert {
+                        self.wtr.write_record(row.iter())?;
+                    } else {
+                        continue;
+                    }
+                }
+                Some(rows) => {
+                    if invert {
+                        self.wtr.write_record(row.iter())?;
+                    } else {
+                        continue;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl Args {
+    fn new_io_state(&self) -> CliResult<IoState<fs::File, Box<io::Write + 'static>>> {
+        let rconf1 = Config::new(&Some(self.arg_input1.clone()))
+            .delimiter(self.flag_delimiter)
+            .no_headers(self.flag_no_headers)
+            .select(self.arg_columns1.clone());
+        let rconf2 = Config::new(&Some(self.arg_input2.clone()))
+            .delimiter(self.flag_delimiter)
+            .no_headers(self.flag_no_headers)
+            .select(self.arg_columns2.clone());
+
+        let mut rdr1 = rconf1.reader_file()?;
+        let mut rdr2 = rconf2.reader_file()?;
+        let (sel1, sel2) = self.get_selections(&rconf1, &mut rdr1, &rconf2, &mut rdr2)?;
+        Ok(IoState {
+            wtr: Config::new(&self.flag_output).writer()?,
+            rdr1: rdr1,
+            sel1: sel1,
+            rdr2: rdr2,
+            sel2: sel2,
+            no_headers: rconf1.no_headers,
+            casei: self.flag_no_case,
+        })
+    }
+
+    fn get_selections<R: io::Read>(
+        &self,
+        rconf1: &Config,
+        rdr1: &mut csv::Reader<R>,
+        rconf2: &Config,
+        rdr2: &mut csv::Reader<R>,
+    ) -> CliResult<(Selection, Selection)> {
+        let headers1 = rdr1.byte_headers()?;
+        let headers2 = rdr2.byte_headers()?;
+        let select1 = rconf1.selection(&*headers1)?;
+        let select2 = rconf2.selection(&*headers2)?;
+        if select1.len() != select2.len() {
+            return fail!(format!(
+                "Column selections must have the same number of columns, \
+                 but found column selections with {} and {} columns.",
+                select1.len(),
+                select2.len()
+            ));
+        }
+        Ok((select1, select2))
+    }
+}
+
+struct ValueIndex<R> {
+    // This maps tuples of values to corresponding rows.
+    values: HashMap<Vec<ByteString>, Vec<usize>>,
+    idx: Indexed<R, io::Cursor<Vec<u8>>>,
+    num_rows: usize,
+}
+
+impl<R: io::Read + io::Seek> ValueIndex<R> {
+    fn new(mut rdr: csv::Reader<R>, sel: &Selection, casei: bool) -> CliResult<ValueIndex<R>> {
+        let mut val_idx = HashMap::with_capacity(10000);
+        let mut row_idx = io::Cursor::new(Vec::with_capacity(8 * 10000));
+        let (mut rowi, mut count) = (0usize, 0usize);
+
+        // This logic is kind of tricky. Basically, we want to include
+        // the header row in the line index (because that's what csv::index
+        // does), but we don't want to include header values in the ValueIndex.
+        if !rdr.has_headers() {
+            // ... so if there are no headers, we seek to the beginning and
+            // index everything.
+            let mut pos = csv::Position::new();
+            pos.set_byte(0);
+            rdr.seek(pos)?;
+        } else {
+            // ... and if there are headers, we make sure that we've parsed
+            // them, and write the offset of the header row to the index.
+            rdr.byte_headers()?;
+            row_idx.write_u64::<BigEndian>(0)?;
+            count += 1;
+        }
+
+        let mut row = csv::ByteRecord::new();
+        while rdr.read_byte_record(&mut row)? {
+            // This is a bit hokey. We're doing this manually instead of using
+            // the `csv-index` crate directly so that we can create both
+            // indexes in one pass.
+            row_idx.write_u64::<BigEndian>(row.position().unwrap().byte())?;
+
+            let fields: Vec<_> = sel.select(&row).map(|v| transform(v, casei)).collect();
+            if !fields.iter().any(|f| f.is_empty()) {
+                match val_idx.entry(fields) {
+                    Entry::Vacant(v) => {
+                        let mut rows = Vec::with_capacity(4);
+                        rows.push(rowi);
+                        v.insert(rows);
+                    }
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().push(rowi);
+                    }
+                }
+            }
+            rowi += 1;
+            count += 1;
+        }
+
+        row_idx.write_u64::<BigEndian>(count as u64)?;
+        let idx = Indexed::open(rdr, io::Cursor::new(row_idx.into_inner()))?;
+        Ok(ValueIndex {
+            values: val_idx,
+            idx: idx,
+            num_rows: rowi,
+        })
+    }
+}
+
+impl<R> fmt::Debug for ValueIndex<R> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // Sort the values by order of first appearance.
+        let mut kvs = self.values.iter().collect::<Vec<_>>();
+        kvs.sort_by(|&(_, v1), &(_, v2)| v1[0].cmp(&v2[0]));
+        for (keys, rows) in kvs.into_iter() {
+            // This is just for debugging, so assume Unicode for now.
+            let keys = keys
+                .iter()
+                .map(|k| String::from_utf8(k.to_vec()).unwrap())
+                .collect::<Vec<_>>();
+            writeln!(f, "({}) => {:?}", keys.join(", "), rows)?
+        }
+        Ok(())
+    }
+}
+
+fn get_row_key(sel: &Selection, row: &csv::ByteRecord, casei: bool) -> Vec<ByteString> {
+    sel.select(row).map(|v| transform(&v, casei)).collect()
+}
+
+fn transform(bs: &[u8], casei: bool) -> ByteString {
+    match str::from_utf8(bs) {
+        Err(_) => bs.to_vec(),
+        Ok(s) => {
+            if !casei {
+                s.trim().as_bytes().to_vec()
+            } else {
+                let norm: String = s
+                    .trim()
+                    .chars()
+                    .map(|c| c.to_lowercase().next().unwrap())
+                    .collect();
+                norm.into_bytes()
+            }
+        }
+    }
+}
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
index 921ad00b..3dea92d5 100644
--- a/src/cmd/mod.rs
+++ b/src/cmd/mod.rs
@@ -1,5 +1,6 @@
 pub mod cat;
 pub mod count;
+pub mod exclude;
 pub mod fixlengths;
 pub mod flatten;
 pub mod fmt;
diff --git a/src/main.rs b/src/main.rs
index 0f9cacfe..1cda8bc8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -45,6 +45,7 @@ macro_rules! command_list {
 "
     cat         Concatenate by row or column
     count       Count records
+    exclude     Excludes the records in one CSV from another
     fixlengths  Makes all records have same length
     flatten     Show one field per line
     fmt         Format CSV output (change field delimiter)
@@ -142,6 +143,7 @@ Please choose one of the following commands:",
 enum Command {
     Cat,
     Count,
+    Exclude,
     FixLengths,
     Flatten,
     Fmt,
@@ -177,6 +179,7 @@ impl Command {
         match self {
             Command::Cat => cmd::cat::run(argv),
             Command::Count => cmd::count::run(argv),
+            Command::Exclude => cmd::exclude::run(argv),
             Command::FixLengths => cmd::fixlengths::run(argv),
             Command::Flatten => cmd::flatten::run(argv),
             Command::Fmt => cmd::fmt::run(argv),
diff --git a/tests/test_exclude.rs b/tests/test_exclude.rs
new file mode 100644
index 00000000..27172359
--- /dev/null
+++ b/tests/test_exclude.rs
@@ -0,0 +1,87 @@
+use workdir::Workdir;
+
+// This macro takes *two* identifiers: one for the test with headers
+// and another for the test without headers.
+macro_rules! exclude_test {
+    ($name:ident, $fun:expr) => {
+        mod $name {
+            use std::process;
+
+            use super::{make_rows, setup};
+            use workdir::Workdir;
+
+            #[test]
+            fn headers() {
+                let wrk = setup(stringify!($name), true);
+                let mut cmd = wrk.command("exclude");
+                cmd.args(&["city", "cities.csv", "city", "places.csv"]);
+                $fun(wrk, cmd, true);
+            }
+
+            #[test]
+            fn no_headers() {
+                let n = stringify!(concat_idents!($name, _no_headers));
+                let wrk = setup(n, false);
+                let mut cmd = wrk.command("exclude");
+                cmd.arg("--no-headers");
+                cmd.args(&["1", "cities.csv", "1", "places.csv"]);
+                $fun(wrk, cmd, false);
+            }
+        }
+    };
+}
+
+fn setup(name: &str, headers: bool) -> Workdir {
+    let mut cities = vec![
+        svec!["Boston", "MA"],
+        svec!["New York", "NY"],
+        svec!["San Francisco", "CA"],
+        svec!["Buffalo", "NY"],
+    ];
+    let mut places = vec![
+        svec!["Boston", "Logan Airport"],
+        svec!["Boston", "Boston Garden"],
+        svec!["Buffalo", "Ralph Wilson Stadium"],
+        svec!["Orlando", "Disney World"],
+    ];
+    if headers {
+        cities.insert(0, svec!["city", "state"]);
+    }
+    if headers {
+        places.insert(0, svec!["city", "place"]);
+    }
+
+    let wrk = Workdir::new(name);
+    wrk.create("cities.csv", cities);
+    wrk.create("places.csv", places);
+    wrk
+}
+
+fn make_rows(headers: bool, rows: Vec<Vec<String>>) -> Vec<Vec<String>> {
+    let mut all_rows = vec![];
+    if headers {
+        all_rows.push(svec!["city", "state"]);
+    }
+    all_rows.extend(rows.into_iter());
+    all_rows
+}
+
+exclude_test!(exclude, |wrk: Workdir,
+                        mut cmd: process::Command,
+                        headers: bool| {
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = make_rows(
+        headers,
+        vec![svec!["New York", "NY"], svec!["San Francisco", "CA"]],
+    );
+    assert_eq!(got, expected);
+});
+
+exclude_test!(include, |wrk: Workdir,
+                        mut cmd: process::Command,
+                        headers: bool| {
+    cmd.arg("-v");
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = make_rows(headers, vec![svec!["Boston", "MA"], svec!["Buffalo", "NY"]]);
+    assert_eq!(got, expected);
+});
diff --git a/tests/tests.rs b/tests/tests.rs
index 37966bc7..a21f7605 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -35,6 +35,7 @@ mod workdir;
 
 mod test_cat;
 mod test_count;
+mod test_exclude;
 mod test_fixlengths;
 mod test_flatten;
 mod test_fmt;