From 35e09aac3465a926a7c52a304ac26b7ed2e62eeb Mon Sep 17 00:00:00 2001 From: kennytm Date: Sun, 14 Apr 2019 00:58:59 +0800 Subject: [PATCH] Support compression. --- CLI.md | 19 ++++++ Cargo.toml | 5 +- src/cli.rs | 86 +++++++++++++++++++++++++- tests/data/compress/flags.json | 6 ++ tests/data/compress/result-schema.sql | 1 + tests/data/compress/result.1.sql.xz | Bin 0 -> 392 bytes tests/data/compress/template.sql | 1 + 7 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 tests/data/compress/flags.json create mode 100644 tests/data/compress/result-schema.sql create mode 100644 tests/data/compress/result.1.sql.xz create mode 100644 tests/data/compress/template.sql diff --git a/CLI.md b/CLI.md index 1c88757..b33a0f1 100644 --- a/CLI.md +++ b/CLI.md @@ -113,11 +113,30 @@ More options * `sql` * `csv` +* `-c «ALG»`, `--compress «ALG»` / `--compress-level «LEVEL»` + + Compress the data output. Possible algorithms are: + + | Algorithm | Levels | + |-----------|--------| + | [gzip] | 0–9 | + | [xz] | 0–9 | + | [zstd] | 1–21 | + + The compression level defaults to 6 if not specified. + + Since the data are randomly generated, the compression ratio is typically not very high (around + 70% of uncompressed input). We do not recommend using the algorithm "xz" here, nor using very + high compression levels. + [ChaCha20]: https://cr.yp.to/chacha.html [HC-128]: https://www.ntu.edu.sg/home/wuhj/research/hc/index.html [ISAAC]: http://www.burtleburtle.net/bob/rand/isaacafa.html [Xorshift]: https://en.wikipedia.org/wiki/Xorshift [PCG32]: http://www.pcg-random.org/ +[gzip]: https://en.wikipedia.org/wiki/Gzip +[xz]: https://en.wikipedia.org/wiki/Xz +[zstd]: https://facebook.github.io/zstd/ [`NO_BACKSLASH_ESCAPES`]: https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sqlmode_no_backslash_escapes [`standard_conforming_strings`]: https://www.postgresql.org/docs/current/static/runtime-config-compatible.html#GUC-STANDARD-CONFORMING-STRINGS \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 7b471d8..b99bb72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ num-traits = "0.2" rayon = "1.0" zipf = "5.0" chrono = { version = "0.4", default-features = false } -chrono-tz = { version = "0.5", features = ["serde"] } +chrono-tz = { version = "0.5.1", features = ["serde"] } ryu = "0.2" serde_derive = "1.0" serde = "1.0" @@ -42,6 +42,9 @@ rand_chacha = "0.1" rand_hc = "0.1" rand_xorshift = "0.1" shlex = "0.1" +flate2 = "1.0" +xz2 = "0.1" +zstd = { version = "0.4", default-features = false } [dev-dependencies] regex = "1.1" diff --git a/src/cli.rs b/src/cli.rs index b51f087..bf9ff46 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -9,6 +9,7 @@ use crate::{ use chrono_tz::Tz; use data_encoding::{DecodeError, DecodeKind, HEXLOWER_PERMISSIVE}; use failure::{Error, Fail, ResultExt}; +use flate2::write::GzEncoder; use muldiv::MulDiv; use pbr::{MultiBar, Units}; use rand::{ @@ -30,6 +31,7 @@ use std::{ time::Duration, }; use structopt::StructOpt; +use xz2::write::XzEncoder; /// Arguments to the `dbgen` CLI program. #[derive(StructOpt, Debug, Deserialize)] @@ -146,6 +148,23 @@ pub struct Args { default_value = "sql" )] pub format: FormatName, + + /// Output compression + #[structopt( + short = "c", + long = "compress", + help = "Compress data output", + raw(possible_values = r#"&["gzip", "gz", "xz", "zstd", "zst"]"#) + )] + pub compression: Option, + + /// Output compression level + #[structopt( + long = "compress-level", + help = "Compression level (0-9 for gzip and xz, 1-21 for zstd)", + default_value = "6" + )] + pub compress_level: u8, } /// The default implementation of the argument suitable for *testing*. @@ -168,6 +187,8 @@ impl Default for Args { quiet: true, time_zone: Tz::UTC, format: FormatName::Sql, + compression: None, + compress_level: 6, } } } @@ -229,6 +250,7 @@ pub fn run(args: Args) -> Result<(), Error> { time_zone: args.time_zone, }; + let compress_level = args.compress_level; let env = Env { out_dir: args.out_dir, file_num_digits: args.files_count.to_string().len(), @@ -242,6 +264,7 @@ pub fn run(args: Args) -> Result<(), Error> { rows_count: args.rows_count, escape_backslash: args.escape_backslash, format: args.format, + compression: args.compression.map(|c| (c, compress_level)), }; env.write_schema(&template.content)?; @@ -392,6 +415,53 @@ impl FormatName { } } +/// Names of the compression output formats supported by `dbgen`. +#[derive(Copy, Clone, Debug, Deserialize)] +pub enum CompressionName { + /// Compress as gzip format (`*.gz`). + Gzip, + /// Compress as xz format (`*.xz`). + Xz, + /// Compress as Zstandard format (`*.zst`). + Zstd, +} + +impl FromStr for CompressionName { + type Err = Error; + fn from_str(name: &str) -> Result { + Ok(match name { + "gzip" | "gz" => CompressionName::Gzip, + "xz" => CompressionName::Xz, + "zstd" | "zst" => CompressionName::Zstd, + _ => failure::bail!("Unsupported format {}", name), + }) + } +} + +impl CompressionName { + /// Obtains the file extension when using this format. + fn extension(self) -> &'static str { + match self { + CompressionName::Gzip => "gz", + CompressionName::Xz => "xz", + CompressionName::Zstd => "zst", + } + } + + /// Wraps a writer with a compression layer on top. + fn wrap<'a, W: Write + 'a>(self, inner: W, level: u8) -> Box { + match self { + CompressionName::Gzip => Box::new(GzEncoder::new(inner, flate2::Compression::new(level.into()))), + CompressionName::Xz => Box::new(XzEncoder::new(inner, level.into())), + CompressionName::Zstd => Box::new( + zstd::Encoder::new(inner, level.into()) + .expect("valid zstd encoder") + .auto_finish(), + ), + } + } +} + /// Wrapping of a [`Write`] which counts how many bytes are written. struct WriteCountWrapper { inner: W, @@ -445,6 +515,7 @@ struct Env { rows_count: u32, escape_backslash: bool, format: FormatName, + compression: Option<(CompressionName, u8)>, } /// Information specific to a data file. @@ -464,14 +535,25 @@ impl Env { /// Writes a single data file. fn write_data_file(&self, info: &FileInfo, state: &mut State) -> Result<(), Error> { - let path = self.out_dir.join(format!( + let mut path = self.out_dir.join(format!( "{0}.{1:02$}.{3}", self.unique_name, info.file_index, self.file_num_digits, self.format.extension(), )); - let mut file = WriteCountWrapper::new(BufWriter::new(File::create(&path).with_path(&path)?)); + + let inner_writer = if let Some((compression, level)) = self.compression { + let mut path_string = path.into_os_string(); + path_string.push("."); + path_string.push(compression.extension()); + path = PathBuf::from(path_string); + compression.wrap(File::create(&path).with_path(&path)?, level) + } else { + Box::new(File::create(&path).with_path(&path)?) + }; + + let mut file = WriteCountWrapper::new(BufWriter::new(inner_writer)); file.skip_write = std::env::var("DBGEN_WRITE_TO_DEV_NULL") .map(|s| s == "1") .unwrap_or(false); diff --git a/tests/data/compress/flags.json b/tests/data/compress/flags.json new file mode 100644 index 0000000..82ecc88 --- /dev/null +++ b/tests/data/compress/flags.json @@ -0,0 +1,6 @@ +{ + "inserts_count": 1, + "rows_count": 100, + "compression": "Xz", + "compress_level": 9 +} diff --git a/tests/data/compress/result-schema.sql b/tests/data/compress/result-schema.sql new file mode 100644 index 0000000..eec768d --- /dev/null +++ b/tests/data/compress/result-schema.sql @@ -0,0 +1 @@ +CREATE TABLE result (); diff --git a/tests/data/compress/result.1.sql.xz b/tests/data/compress/result.1.sql.xz new file mode 100644 index 0000000000000000000000000000000000000000..da41f9f5285c0bfdd5e5f4766006d70be8343aee GIT binary patch literal 392 zcmV;30eAlWH+ooF000E$*0e?f03iV!0000G&sfah1Wo}+T>vDLhO4f+*EpYqfm>9# zVwx$EAkZW}qJEvSQQ1-0h(M1hTnzEQV+p6ROeyELJVvnzN?5KdLt~3=w%J`&qejl3 z3<_HNB{&BKe*q|sW89p1qQ%!Tjc}oA?-fETFfFlPMmzg1_{7Q`-#ztL0J2|cvq9yk z1UNjIvaJg=t&~5JxJiM`N*|MqNboxVYDI7mNuD!Bqr^$!HY*yx;0I3hi0!6Pj;*$8 z(>^;B(?k+0Z|0z4+)e=>N2ogF7#d^g^toYozdpX;>{axJF6WwR7I>+*gF*qjn!>!R z(@M70(YXlg=}$@L=Q{oSsdy}FEf4ZnZIZv~SvfzFZ6^+{m{l?NH}2c?c;RFpe=Fm$ zO-B|8L;5Eu_nA_HvIUwdbCvi7BkJ5-7gzh5JIOD`%xuZlCzuwVVp8h1{$72(I+xp= m0002f$r%mgx+h!!0ptSD2mk=VaeWlA#Ao{g000001X)^5bh5Dk literal 0 HcmV?d00001 diff --git a/tests/data/compress/template.sql b/tests/data/compress/template.sql new file mode 100644 index 0000000..711a57b --- /dev/null +++ b/tests/data/compress/template.sql @@ -0,0 +1 @@ +CREATE TABLE result ({{ rownum }} {{ rownum * 7 }}); \ No newline at end of file