From 8c4fd7b9a010426b81aef9df0f82e2fd4a6521cb Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 7 Jan 2024 09:00:59 -0500
Subject: [PATCH 1/3] `reverse`: add index support
With an index, `reverse` streams records instead of buffering them, so it can reverse CSV files of arbitrary length.
---
src/cmd/reverse.rs | 38 ++++++++++++++++++++++++++++----------
1 file changed, 28 insertions(+), 10 deletions(-)
diff --git a/src/cmd/reverse.rs b/src/cmd/reverse.rs
index d879c1090..4a511177e 100644
--- a/src/cmd/reverse.rs
+++ b/src/cmd/reverse.rs
@@ -4,7 +4,8 @@ Reverses rows of CSV data.
Useful for cases when there is no column that can be used for sorting in reverse order,
or when keys are not unique and order of rows with the same key needs to be preserved.
-Note that this requires reading all of the CSV data into memory.
+Note that if the CSV is not indexed, this operation will require reading all of the
+CSV data into memory.
Usage:
qsv reverse [options] []
@@ -46,19 +47,36 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
.no_headers(args.flag_no_headers);
let mut rdr = rconfig.reader()?;
+ let mut wtr = Config::new(&args.flag_output).writer()?;
- // we're loading the entire file into memory, we need to check avail mem
- if let Some(path) = rconfig.path.clone() {
- util::mem_file_check(&path, false, args.flag_memcheck)?;
- }
+    let Some(mut idx_file) = rconfig.indexed()? else {
+        // no index available: every record has to be loaded into memory before it
+        // can be reversed, so check that enough memory is available first
+        if let Some(path) = rconfig.path.clone() {
+            util::mem_file_check(&path, false, args.flag_memcheck)?;
+        }
-    let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
-    all.reverse();
+        let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
+        all.reverse();
-    let mut wtr = Config::new(&args.flag_output).writer()?;
+        rconfig.write_headers(&mut rdr, &mut wtr)?;
+        for r in all {
+            wtr.write_byte_record(&r)?;
+        }
+        return Ok(wtr.flush()?);
+    };
+
+    // we have an index, so no memory check is needed: stream the records in reverse by
+    // seeking from the last index down to 0 (no records means no iterations, no underflow)
     rconfig.write_headers(&mut rdr, &mut wtr)?;
-    for r in all {
-        wtr.write_byte_record(&r)?;
+    let mut record = csv::ByteRecord::new();
+    for pos in (0..idx_file.count()).rev() {
+        idx_file.seek(pos)?;
+        if !idx_file.read_byte_record(&mut record)? {
+            break;
+        }
+        wtr.write_byte_record(&record)?;
     }
+
Ok(wtr.flush()?)
}
From 3fea1d896b7c7e88896565c9e33231b37b901a72 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 7 Jan 2024 09:01:25 -0500
Subject: [PATCH 2/3] `tests`: add property tests for reverse indexed mode
---
tests/test_reverse.rs | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/tests/test_reverse.rs b/tests/test_reverse.rs
index 11c023a2b..9ad165e1b 100644
--- a/tests/test_reverse.rs
+++ b/tests/test_reverse.rs
@@ -39,3 +39,43 @@ fn prop_reverse_no_headers() {
}
qcheck(p as fn(CsvData) -> bool);
}
+
+/// Property: `reverse` run on an indexed "in.csv" built from `rows` prints the rows reversed.
+fn prop_reverse_indexed(name: &str, rows: CsvData, headers: bool) -> bool {
+    let wrk = Workdir::new(name);
+    wrk.create_indexed("in.csv", rows.clone());
+    let mut cmd = wrk.command("reverse");
+    cmd.arg("in.csv");
+    if !headers {
+        cmd.arg("--no-headers");
+    }
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    // with headers, the first row must stay on top while the remaining rows are reversed
+    let mut expected = rows.to_vecs();
+    let header_row = if headers && !expected.is_empty() {
+        expected.remove(0)
+    } else {
+        vec![]
+    };
+    expected.reverse();
+    if !header_row.is_empty() {
+        expected.insert(0, header_row);
+    }
+    rassert_eq!(got, expected)
+}
+
+#[test]
+fn prop_reverse_headers_indexed() {
+    // quickcheck the indexed reverse property with the first row treated as a header
+    let property: fn(CsvData) -> bool =
+        |rows| prop_reverse_indexed("prop_reverse_headers_indexed", rows, true);
+    qcheck(property);
+}
+
+#[test]
+fn prop_reverse_no_headers_indexed() {
+    // quickcheck the indexed reverse property with `--no-headers` (every row is data)
+    let property: fn(CsvData) -> bool =
+        |rows| prop_reverse_indexed("prop_reverse_no_headers_indexed", rows, false);
+    qcheck(property);
+}
From 4e0b943a25ca68d7743c8661c0c0ae75f6bd4d81 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 7 Jan 2024 09:09:06 -0500
Subject: [PATCH 3/3] `readme`: `reverse` now has index support and can work in
streaming mode
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b0f3e1a08..9457145dc 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@
| [py](/src/cmd/python.rs#L2) ✨🔣 | Create a new computed column or filter rows by evaluating a python expression on every row of a CSV file. Python's [f-strings](https://www.freecodecamp.org/news/python-f-strings-tutorial-how-to-use-f-strings-for-string-formatting/) is particularly useful for extended formatting, [with the ability to evaluate Python expressions as well](https://github.com/jqnatividad/qsv/blob/4cd00dca88addf0d287247fa27d40563b6d46985/src/cmd/python.rs#L23-L31). |
| [rename](/src/cmd/rename.rs#L2) | Rename the columns of a CSV efficiently. |
| [replace](/src/cmd/replace.rs#L2) | Replace CSV data using a regex. Applies the regex to each field individually. |
-| [reverse](/src/cmd/reverse.rs#L2) 🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. |
+| [reverse](/src/cmd/reverse.rs#L2) 📇🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. If an index is present, it works with constant memory. Otherwise, it will load all the data into memory. |
| [safenames](/src/cmd/safenames.rs#L2) ![CKAN](docs/images/ckan.png) | Modify headers of a CSV to only have ["safe" names](/src/cmd/safenames.rs#L5-L14) - guaranteed "database-ready"/"CKAN-ready" names. |
| [sample](/src/cmd/sample.rs#L2) 📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, using random indexing with constant memory. |
| [schema](/src/cmd/schema.rs#L2) 📇😣🏎️ | Infer schema from CSV data, replete with data type & domain/range validation & output in [JSON Schema](https://json-schema.org/) format. Uses multithreading to go faster if an index is present. See `validate` command to use the generated JSON Schema to validate if similar CSVs comply with the schema. |