diff --git a/Cargo.lock b/Cargo.lock index 3db03f441..97103970b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2616,9 +2616,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.151" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "libduckdb-sys" @@ -4040,7 +4040,6 @@ dependencies = [ "dynfmt", "eudex", "ext-sort", - "fastrand 2.0.1", "file-format", "filetime", "flate2", diff --git a/Cargo.toml b/Cargo.toml index 3166c6f07..fabc91692 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,7 +99,6 @@ eudex = { version = "0.1", optional = true } ext-sort = { version = "0.1", features = [ "memory-limit", ], default-features = false } -fastrand = "2" flate2 = { version = "1", optional = true } file-format = { version = "0.23", features = ["reader"] } filetime = "0.2" diff --git a/src/cmd/sort.rs b/src/cmd/sort.rs index 58834c9ff..5a37c2443 100644 --- a/src/cmd/sort.rs +++ b/src/cmd/sort.rs @@ -20,11 +20,24 @@ sort options: See 'qsv select --help' for the format details. -N, --numeric Compare according to string numerical value -R, --reverse Reverse order - --random Random order - --seed Random number generator seed to use if --random is set -i, --ignore-case Compare strings disregarding case -u, --unique When set, identical consecutive lines will be dropped to keep only one line per sorted value. + + --random Random order + --seed Random Number Generator (RNG) seed to use if --random is set + --rng The RNG algorithm to use if --random is set. + Three RNGs are supported: + - standard: Use the standard RNG. + 1.5 GB/s throughput. + - faster: Use faster RNG using the Xoshiro256Plus algorithm. + 8 GB/s throughput. + - cryptosecure: Use cryptographically secure HC128 algorithm. + Recommended by eSTREAM (https://www.ecrypt.eu.org/stream/). + 2.1 GB/s throughput though slow initialization. + [default: standard] + + -j, --jobs The number of jobs to run in parallel. When not set, the number of jobs is set to the number of CPUs detected. @@ -36,11 +49,6 @@ sort options: which is useful for sorting large files that will otherwise NOT fit in memory using the default allocating stable sort. - - For --random sorts, this means using an alternative - random number generator (RNG) that uses the faster - Wyrand algorithm instead of the ChaCha algorithm used - by the standard RNG. Common options: -h, --help Display this message @@ -55,13 +63,16 @@ Common options: CSV into memory using CONSERVATIVE heuristics. "#; -use std::cmp; +use std::{cmp, str::FromStr}; -use fastrand; //DevSkim: ignore DS148264 -use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; +// use fastrand; //DevSkim: ignore DS148264 +use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; +use rand_hc::Hc128Rng; +use rand_xoshiro::Xoshiro256Plus; use rayon::slice::ParallelSliceMut; use serde::Deserialize; use simdutf8::basic::from_utf8; +use strum_macros::EnumString; use self::Number::{Float, Int}; use crate::{ @@ -77,18 +88,27 @@ struct Args { flag_select: SelectColumns, flag_numeric: bool, flag_reverse: bool, + flag_ignore_case: bool, + flag_unique: bool, flag_random: bool, flag_seed: Option, - flag_ignore_case: bool, + flag_rng: String, flag_jobs: Option, flag_faster: bool, flag_output: Option, flag_no_headers: bool, flag_delimiter: Option, - flag_unique: bool, flag_memcheck: bool, } +#[derive(Debug, EnumString, PartialEq)] +#[strum(ascii_case_insensitive)] +enum RngKind { + Standard, + Faster, + Cryptosecure, +} + pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; let numeric = args.flag_numeric; @@ -100,6 +120,13 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .no_headers(args.flag_no_headers) .select(args.flag_select); + let Ok(rng_kind) = RngKind::from_str(&args.flag_rng) else { + return fail_incorrectusage_clierror!( + "Invalid RNG algorithm `{}`. Supported RNGs are: standard, faster, cryptosecure.", + args.flag_rng + ); + }; + // we're loading the entire file into memory, we need to check avail memory if let Some(path) = rconfig.path.clone() { // we only check if we're doing a stable sort and its not --random @@ -124,25 +151,61 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut all = rdr.byte_records().collect::, _>>()?; match (numeric, reverse, random, faster) { - // --random stable sort - (_, _, true, false) => { - // we don't need cryptographically strong RNGs for this - // add DevSkim lint ignores to suppress warning - if let Some(val) = seed { - let mut rng = StdRng::seed_from_u64(val); //DevSkim: ignore DS148264 - SliceRandom::shuffle(&mut *all, &mut rng); //DevSkim: ignore DS148264 - } else { - let mut rng = ::rand::thread_rng(); - SliceRandom::shuffle(&mut *all, &mut rng); //DevSkim: ignore DS148264 - } - }, - // --random --faster stable sort - (_, _, true, true) => { - // faster random sorts using Wyrand - if let Some(val) = seed { - fastrand::seed(val); //DevSkim: ignore DS148264 + // --random sort + (_, _, true, _) => { + match rng_kind { + RngKind::Standard => { + if let Some(val) = seed { + let mut rng = StdRng::seed_from_u64(val); //DevSkim: ignore DS148264 + all.shuffle(&mut rng); //DevSkim: ignore DS148264 + } else { + let mut rng = ::rand::thread_rng(); + all.shuffle(&mut rng); //DevSkim: ignore DS148264 + } + }, + RngKind::Faster => { + let mut rng = match args.flag_seed { + None => Xoshiro256Plus::from_rng(rand::thread_rng()).unwrap(), + Some(seed) => Xoshiro256Plus::seed_from_u64(seed), /* DevSkim: ignore + * DS148264 */ + }; + SliceRandom::shuffle(&mut *all, &mut rng); //DevSkim: ignore DS148264 + }, + RngKind::Cryptosecure => { + let seed_32 = match args.flag_seed { + None => rand::thread_rng().gen::<[u8; 32]>(), + Some(seed) => { + let seed_u8 = seed.to_le_bytes(); + let mut seed_32 = [0u8; 32]; + seed_32[..8].copy_from_slice(&seed_u8); + seed_32 + }, + }; + let mut rng: Hc128Rng = match args.flag_seed { + None => Hc128Rng::from_rng(rand::thread_rng()).unwrap(), + Some(_) => Hc128Rng::from_seed(seed_32), + }; + SliceRandom::shuffle(&mut *all, &mut rng); + }, } - fastrand::shuffle(&mut all); //DevSkim: ignore DS148264 + + // // we don't need cryptographically strong RNGs for this + // // add DevSkim lint ignores to suppress warning + // if let Some(val) = seed { + // let mut rng = StdRng::seed_from_u64(val); //DevSkim: ignore DS148264 + // SliceRandom::shuffle(&mut *all, &mut rng); //DevSkim: ignore DS148264 + // } else { + // let mut rng = ::rand::thread_rng(); + // SliceRandom::shuffle(&mut *all, &mut rng); //DevSkim: ignore DS148264 + // } + // }, + // // --random --faster stable sort + // (_, _, true, true) => { + // // faster random sorts using Wyrand + // if let Some(val) = seed { + // fastrand::seed(val); //DevSkim: ignore DS148264 + // } + // fastrand::shuffle(&mut all); //DevSkim: ignore DS148264 }, // default stable parallel sort diff --git a/tests/test_sort.rs b/tests/test_sort.rs index 82eacaa6d..605171fd5 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -465,18 +465,53 @@ fn sort_random_faster() { let mut cmd = wrk.command("sort"); cmd.arg("--random") .args(["--seed", "42"]) - .arg("--faster") + .args(["--rng", "faster"]) .arg("in.csv"); let got: Vec> = wrk.read_stdout(&mut cmd); let expected = vec![ svec!["R", "S"], + svec!["5", "f"], + svec!["3", "d"], + svec!["4", "c"], + svec!["6", "e"], + svec!["2", "a"], svec!["1", "b"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn sort_random_secure() { + let wrk = Workdir::new("sort_random_secure"); + wrk.create( + "in.csv", + vec![ + svec!["R", "S"], + svec!["1", "b"], + svec!["2", "a"], + svec!["3", "d"], + svec!["4", "c"], + svec!["5", "f"], + svec!["6", "e"], + ], + ); + + let mut cmd = wrk.command("sort"); + cmd.arg("--random") + .args(["--seed", "42"]) + .args(["--rng", "cryptosecure"]) + .arg("in.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["R", "S"], + svec!["3", "d"], + svec!["5", "f"], svec!["2", "a"], svec!["6", "e"], svec!["4", "c"], - svec!["5", "f"], - svec!["3", "d"], + svec!["1", "b"], ]; assert_eq!(got, expected); }