Skip to content

Commit

Permalink
Support compression.
Browse files Browse the repository at this point in the history
  • Loading branch information
kennytm committed Apr 13, 2019
1 parent 05d445a commit 35e09aa
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 3 deletions.
19 changes: 19 additions & 0 deletions CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,30 @@ More options
* `sql`
* `csv`

* `-c «ALG»`, `--compress «ALG»` / `--compress-level «LEVEL»`

Compress the data output. Possible algorithms are:

| Algorithm | Levels |
|-----------|--------|
| [gzip] | 0–9 |
| [xz] | 0–9 |
| [zstd] | 1–21 |

The compression level defaults to 6 if not specified.

Since the data are randomly generated, the compression ratio is typically not very high (around
70% of uncompressed input). We do not recommend using the algorithm "xz" here, nor using very
high compression levels.

[ChaCha20]: https://cr.yp.to/chacha.html
[HC-128]: https://www.ntu.edu.sg/home/wuhj/research/hc/index.html
[ISAAC]: http://www.burtleburtle.net/bob/rand/isaacafa.html
[Xorshift]: https://en.wikipedia.org/wiki/Xorshift
[PCG32]: http://www.pcg-random.org/
[gzip]: https://en.wikipedia.org/wiki/Gzip
[xz]: https://en.wikipedia.org/wiki/Xz
[zstd]: https://facebook.github.io/zstd/

[`NO_BACKSLASH_ESCAPES`]: https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sqlmode_no_backslash_escapes
[`standard_conforming_strings`]: https://www.postgresql.org/docs/current/static/runtime-config-compatible.html#GUC-STANDARD-CONFORMING-STRINGS
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ num-traits = "0.2"
rayon = "1.0"
zipf = "5.0"
chrono = { version = "0.4", default-features = false }
chrono-tz = { version = "0.5", features = ["serde"] }
chrono-tz = { version = "0.5.1", features = ["serde"] }
ryu = "0.2"
serde_derive = "1.0"
serde = "1.0"
Expand All @@ -42,6 +42,9 @@ rand_chacha = "0.1"
rand_hc = "0.1"
rand_xorshift = "0.1"
shlex = "0.1"
flate2 = "1.0"
xz2 = "0.1"
zstd = { version = "0.4", default-features = false }

[dev-dependencies]
regex = "1.1"
Expand Down
86 changes: 84 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::{
use chrono_tz::Tz;
use data_encoding::{DecodeError, DecodeKind, HEXLOWER_PERMISSIVE};
use failure::{Error, Fail, ResultExt};
use flate2::write::GzEncoder;
use muldiv::MulDiv;
use pbr::{MultiBar, Units};
use rand::{
Expand All @@ -30,6 +31,7 @@ use std::{
time::Duration,
};
use structopt::StructOpt;
use xz2::write::XzEncoder;

/// Arguments to the `dbgen` CLI program.
#[derive(StructOpt, Debug, Deserialize)]
Expand Down Expand Up @@ -146,6 +148,23 @@ pub struct Args {
default_value = "sql"
)]
pub format: FormatName,

/// Output compression
#[structopt(
short = "c",
long = "compress",
help = "Compress data output",
raw(possible_values = r#"&["gzip", "gz", "xz", "zstd", "zst"]"#)
)]
pub compression: Option<CompressionName>,

/// Output compression level
#[structopt(
long = "compress-level",
help = "Compression level (0-9 for gzip and xz, 1-21 for zstd)",
default_value = "6"
)]
pub compress_level: u8,
}

/// The default implementation of the argument suitable for *testing*.
Expand All @@ -168,6 +187,8 @@ impl Default for Args {
quiet: true,
time_zone: Tz::UTC,
format: FormatName::Sql,
compression: None,
compress_level: 6,
}
}
}
Expand Down Expand Up @@ -229,6 +250,7 @@ pub fn run(args: Args) -> Result<(), Error> {
time_zone: args.time_zone,
};

let compress_level = args.compress_level;
let env = Env {
out_dir: args.out_dir,
file_num_digits: args.files_count.to_string().len(),
Expand All @@ -242,6 +264,7 @@ pub fn run(args: Args) -> Result<(), Error> {
rows_count: args.rows_count,
escape_backslash: args.escape_backslash,
format: args.format,
compression: args.compression.map(|c| (c, compress_level)),
};

env.write_schema(&template.content)?;
Expand Down Expand Up @@ -392,6 +415,53 @@ impl FormatName {
}
}

/// Names of the compression output formats supported by `dbgen`.
#[derive(Copy, Clone, Debug, Deserialize)]
pub enum CompressionName {
/// Compress as gzip format (`*.gz`).
Gzip,
/// Compress as xz format (`*.xz`).
Xz,
/// Compress as Zstandard format (`*.zst`).
Zstd,
}

impl FromStr for CompressionName {
type Err = Error;
fn from_str(name: &str) -> Result<Self, Self::Err> {
Ok(match name {
"gzip" | "gz" => CompressionName::Gzip,
"xz" => CompressionName::Xz,
"zstd" | "zst" => CompressionName::Zstd,
_ => failure::bail!("Unsupported format {}", name),
})
}
}

impl CompressionName {
/// Obtains the file extension when using this format.
fn extension(self) -> &'static str {
match self {
CompressionName::Gzip => "gz",
CompressionName::Xz => "xz",
CompressionName::Zstd => "zst",
}
}

/// Wraps a writer with a compression layer on top.
fn wrap<'a, W: Write + 'a>(self, inner: W, level: u8) -> Box<dyn Write + 'a> {
match self {
CompressionName::Gzip => Box::new(GzEncoder::new(inner, flate2::Compression::new(level.into()))),
CompressionName::Xz => Box::new(XzEncoder::new(inner, level.into())),
CompressionName::Zstd => Box::new(
zstd::Encoder::new(inner, level.into())
.expect("valid zstd encoder")
.auto_finish(),
),
}
}
}

/// Wrapping of a [`Write`] which counts how many bytes are written.
struct WriteCountWrapper<W: Write> {
inner: W,
Expand Down Expand Up @@ -445,6 +515,7 @@ struct Env {
rows_count: u32,
escape_backslash: bool,
format: FormatName,
compression: Option<(CompressionName, u8)>,
}

/// Information specific to a data file.
Expand All @@ -464,14 +535,25 @@ impl Env {

/// Writes a single data file.
fn write_data_file(&self, info: &FileInfo, state: &mut State) -> Result<(), Error> {
let path = self.out_dir.join(format!(
let mut path = self.out_dir.join(format!(
"{0}.{1:02$}.{3}",
self.unique_name,
info.file_index,
self.file_num_digits,
self.format.extension(),
));
let mut file = WriteCountWrapper::new(BufWriter::new(File::create(&path).with_path(&path)?));

let inner_writer = if let Some((compression, level)) = self.compression {
let mut path_string = path.into_os_string();
path_string.push(".");
path_string.push(compression.extension());
path = PathBuf::from(path_string);
compression.wrap(File::create(&path).with_path(&path)?, level)
} else {
Box::new(File::create(&path).with_path(&path)?)
};

let mut file = WriteCountWrapper::new(BufWriter::new(inner_writer));
file.skip_write = std::env::var("DBGEN_WRITE_TO_DEV_NULL")
.map(|s| s == "1")
.unwrap_or(false);
Expand Down
6 changes: 6 additions & 0 deletions tests/data/compress/flags.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"inserts_count": 1,
"rows_count": 100,
"compression": "Xz",
"compress_level": 9
}
1 change: 1 addition & 0 deletions tests/data/compress/result-schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE TABLE result ();
Binary file added tests/data/compress/result.1.sql.xz
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/compress/template.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE TABLE result ({{ rownum }} {{ rownum * 7 }});

0 comments on commit 35e09aa

Please sign in to comment.