From ad6c48b63b34890251d94416f620e00474ba32eb Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 10 Dec 2024 00:38:29 -0500 Subject: [PATCH] feat: `template` add `--outsubdir-size` option When rendering files to a directory, distribute them across subdirectories that each hold --outsubdir-size files. This makes it easier to handle and explore generated files, especially for large input CSVs. It also keeps the filesystem performant, as it slows down severely when a single directory holds a large number of files. --- src/cmd/template.rs | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/cmd/template.rs b/src/cmd/template.rs index 446eba5bc..f63300c2c 100644 --- a/src/cmd/template.rs +++ b/src/cmd/template.rs @@ -63,6 +63,14 @@ template arguments: <outdir> The directory where the output files will be written. If it does not exist, it will be created. If not set, output will be sent to stdout or the specified --output. + When writing to <outdir>, files are organized into subdirectories + of --outsubdir-size (default: 1000) files each to avoid filesystem + navigation & performance issues. + For example, with 3500 records: + <outdir>/0000/0001.txt through <outdir>/0000/1000.txt + <outdir>/0001/1001.txt through <outdir>/0001/2000.txt + <outdir>/0002/2001.txt through <outdir>/0002/3000.txt + <outdir>/0003/3001.txt through <outdir>/0003/3500.txt template options: --template MiniJinja template string to use (alternative to --template-file) -t, --template-file MiniJinja template file to use @@ -73,6 +81,8 @@ template options: Note that all the fields, including QSV_ROWNO, are available when defining the filename template. [default: QSV_ROWNO] + --outsubdir-size The number of files per subdirectory in <outdir>. + [default: 1000] --customfilter-error The value to return when a custom filter returns an error. Use "" to return an empty string. 
[default: ] @@ -140,6 +150,7 @@ struct Args { flag_template_file: Option, flag_output: Option, flag_outfilename: String, + flag_outsubdir_size: u16, flag_customfilter_error: String, flag_jobs: Option, flag_batch: usize, @@ -479,11 +490,33 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .collect_into_vec(&mut batch_results); let mut outpath = std::path::PathBuf::new(); - for result_record in &batch_results { + let mut current_subdir = None; + let outsubdir_numfiles = args.flag_outsubdir_size as usize; + + for (idx, result_record) in batch_results.iter().enumerate() { if output_to_dir { // safety: this is safe as output_to_dir = args.arg_outdir.is_some() // and result_record.0 (the filename to use) is_some() outpath.push(args.arg_outdir.as_ref().unwrap()); + + // Create subdirectory for every outsubdir_size files + // to make it easier to handle & navigate generated files + // particularly, if we're using a large input CSV + let subdir_num = idx / outsubdir_numfiles; + + if current_subdir == Some(subdir_num) { + outpath.push(format!("{subdir_num:0width$}")); + } else { + // Only create new subdir when needed + let subdir_name = format!("{subdir_num:0width$}"); + outpath.push(&subdir_name); + + if !outpath.exists() { + fs::create_dir(&outpath)?; + } + current_subdir = Some(subdir_num); + } + outpath.push(result_record.0.as_deref().unwrap()); // if output_to_dir is true, we'll be writing a LOT of files (one for each row)