Merge pull request #1531 from jqnatividad/reverse_indexed

`reverse`: now has index support and can work in "streaming" mode
dathere · Jan 7, 2024 · 405d7a1 · 405d7a1
2 parents 5d4bc9c + 4e0b943
commit 405d7a1
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@
 | [py](/src/cmd/python.rs#L2)<br>✨🔣 | Create a new computed column or filter rows by evaluating a python expression on every row of a CSV file. Python's [f-strings](https://www.freecodecamp.org/news/python-f-strings-tutorial-how-to-use-f-strings-for-string-formatting/) is particularly useful for extended formatting, [with the ability to evaluate Python expressions as well](https://github.com/jqnatividad/qsv/blob/4cd00dca88addf0d287247fa27d40563b6d46985/src/cmd/python.rs#L23-L31). |
 | [rename](/src/cmd/rename.rs#L2) |  Rename the columns of a CSV efficiently. |
 | [replace](/src/cmd/replace.rs#L2) | Replace CSV data using a regex. Applies the regex to each field individually. |
-| [reverse](/src/cmd/reverse.rs#L2)<br>🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key.  |
+| [reverse](/src/cmd/reverse.rs#L2)<br>📇🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. If an index is present, it works with constant memory. Otherwise, it will load all the data into memory. |
 | <a name="safenames_deeplink"></a>[safenames](/src/cmd/safenames.rs#L2)<br>![CKAN](docs/images/ckan.png) | Modify headers of a CSV to only have ["safe" names](/src/cmd/safenames.rs#L5-L14) - guaranteed "database-ready"/"CKAN-ready" names.  |
 | [sample](/src/cmd/sample.rs#L2)<br>📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, using random indexing with constant memory. |
 | [schema](/src/cmd/schema.rs#L2)<br>📇😣🏎️ | Infer schema from CSV data, replete with data type & domain/range validation & output in [JSON Schema](https://json-schema.org/) format. Uses multithreading to go faster if an index is present. See `validate` command to use the generated JSON Schema to validate if similar CSVs comply with the schema. |

diff --git a/src/cmd/reverse.rs b/src/cmd/reverse.rs
@@ -4,7 +4,8 @@ Reverses rows of CSV data.
 Useful for cases when there is no column that can be used for sorting in reverse order,
 or when keys are not unique and order of rows with the same key needs to be preserved.
 
-Note that this requires reading all of the CSV data into memory.
+Note that if the CSV is not indexed, this operation will require reading all of the
+CSV data into memory.
 
 Usage:
     qsv reverse [options] [<input>]
@@ -46,19 +47,36 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         .no_headers(args.flag_no_headers);
 
     let mut rdr = rconfig.reader()?;
+    let mut wtr = Config::new(&args.flag_output).writer()?;
 
-    // we're loading the entire file into memory, we need to check avail mem
-    if let Some(path) = rconfig.path.clone() {
-        util::mem_file_check(&path, false, args.flag_memcheck)?;
-    }
+    let Some(mut idx_file) = rconfig.indexed()? else {
+        // without an index, the entire file has to be loaded into memory,
+        // so first check that enough memory is available
+        if let Some(path) = rconfig.path.clone() {
+            util::mem_file_check(&path, false, args.flag_memcheck)?;
+        }
 
-    let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
-    all.reverse();
+        let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
+        all.reverse();
 
-    let mut wtr = Config::new(&args.flag_output).writer()?;
+        rconfig.write_headers(&mut rdr, &mut wtr)?;
+        for r in all {
+            wtr.write_byte_record(&r)?;
+        }
+        return Ok(wtr.flush()?);
+    };
+
+    // we have an index, so there is no need to check available memory:
+    // records are streamed one at a time, from the last one to the first
     rconfig.write_headers(&mut rdr, &mut wtr)?;
-    for r in all {
-        wtr.write_byte_record(&r)?;
+    let mut record = csv::ByteRecord::new();
+    let mut pos = idx_file.count().saturating_sub(1);
+    idx_file.seek(pos)?;
+    while idx_file.read_byte_record(&mut record)? {
+        wtr.write_byte_record(&record)?;
+        pos -= 1;
+        idx_file.seek(pos)?; // seek to next pos
     }
+
     Ok(wtr.flush()?)
 }
diff --git a/tests/test_reverse.rs b/tests/test_reverse.rs
@@ -39,3 +39,43 @@ fn prop_reverse_no_headers() {
     }
     qcheck(p as fn(CsvData) -> bool);
 }
+
+fn prop_reverse_indexed(name: &str, rows: CsvData, headers: bool) -> bool {
+    let wrk = Workdir::new(name);
+    wrk.create_indexed("in.csv", rows.clone());
+
+    let mut cmd = wrk.command("reverse");
+    cmd.arg("in.csv");
+    if !headers {
+        cmd.arg("--no-headers");
+    }
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let mut expected = rows.to_vecs();
+    let headers = if headers && !expected.is_empty() {
+        expected.remove(0)
+    } else {
+        vec![]
+    };
+    expected.reverse();
+    if !headers.is_empty() {
+        expected.insert(0, headers);
+    }
+    rassert_eq!(got, expected)
+}
+
+#[test]
+fn prop_reverse_headers_indexed() {
+    fn p(rows: CsvData) -> bool {
+        prop_reverse_indexed("prop_reverse_headers_indexed", rows, true)
+    }
+    qcheck(p as fn(CsvData) -> bool);
+}
+
+#[test]
+fn prop_reverse_no_headers_indexed() {
+    fn p(rows: CsvData) -> bool {
+        prop_reverse_indexed("prop_reverse_no_headers_indexed", rows, false)
+    }
+    qcheck(p as fn(CsvData) -> bool);
+}