forked from dathere/qsv
-
Notifications
You must be signed in to change notification settings - Fork 0
/
safenames.rs
269 lines (228 loc) · 10.3 KB
/
safenames.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
static USAGE: &str = r#"
Modify headers of a CSV to only have "safe" names - guaranteed "database-ready" names
(optimized specifically for PostgreSQL column identifiers).
Fold to lowercase. Trim leading & trailing whitespaces. Replace whitespace/non-alphanumeric
characters with _. If name starts with a number & check_first_char is true, prepend the unsafe prefix.
If a header with the same name already exists, append a sequence suffix (e.g. col, col_2, col_3).
Names are limited to 60 characters in length. Empty names are replaced with the unsafe prefix.
In addition, specifically because of CKAN Datastore requirements:
- Headers with leading underscores are replaced with "unsafe_" prefix.
- Headers that are named "_id" are renamed to "reserved__id".
These CKAN Datastore options can be configured via the --prefix & --reserved options, respectively.
In Always (a) and Conditional (c) mode, returns number of modified headers to stderr,
and sends CSV with safe headers output to stdout.
In Verify (v) mode, returns number of unsafe headers to stderr.
In Verbose (V) mode, returns number of headers; duplicate count and unsafe & safe headers to stderr.
No stdout output is generated in Verify and Verbose mode.
In JSON (j) mode, returns Verbose mode info in minified JSON to stdout.
In Pretty JSON (J) mode, returns Verbose mode info in pretty printed JSON to stdout.
Given data.csv:
c1,12_col,Col with Embedded Spaces,,Column!@Invalid+Chars,c1
1,a2,a3,a4,a5,a6
$ qsv safenames data.csv
c1,unsafe_12_col,col_with_embedded_spaces,unsafe_,column__invalid_chars,c1_2
1,a2,a3,a4,a5,a6
stderr: 5
Conditionally rename headers, allowing "quoted identifiers":
$ qsv safenames --mode c data.csv
c1,unsafe_12_col,Col with Embedded Spaces,unsafe_,column__invalid_chars,c1_2
1,a2,a3,a4,a5,a6
stderr: 4
Verify how many "unsafe" headers are found:
$ qsv safenames --mode v data.csv
stderr: 4
Verbose mode:
$ qsv safenames --mode V data.csv
stderr: 6 header/s
1 duplicate/s: "c1:2"
4 unsafe header/s: ["12_col", "Col with Embedded Spaces", "", "Column!@Invalid+Chars"]
1 safe header/s: ["c1"]
Note that even if "Col with Embedded Spaces" is technically safe, it is generally discouraged.
Though it can be created as a "quoted identifier" in PostgreSQL, it is still marked "unsafe"
by default, unless mode is set to "conditional."
It is discouraged because the embedded spaces can cause problems later on.
(see https://lerner.co.il/2013/11/30/quoting-postgresql/ for more info).
For more examples, see https://github.com/dathere/qsv/blob/master/tests/test_safenames.rs.
Usage:
qsv safenames [options] [<input>]
qsv safenames --help
safenames options:
--mode <c|a|v|V|j|J> Rename header names to "safe" names - i.e.
guaranteed "database-ready" names.
It has six modes - conditional, always, verify, Verbose,
with Verbose having two submodes - JSON & pretty JSON.
conditional (c) - check first before renaming and allow
"quoted identifiers" - mixed case with embedded spaces.
always (a) - goes ahead and renames all headers
without checking if they're already "safe".
verify (v) - count "unsafe" header names without
modifying them. Note that verify does not count
"quoted identifiers" as unsafe.
verbose (V) - like verify, but verbose, showing
total header count, duplicates, unsafe headers & safe headers.
JSON (j) - similar to verbose in minified JSON.
pretty JSON (J) - verbose in pretty-printed JSON
[default: Always]
--reserved <list> Comma-delimited list of additional case-insensitive reserved names
that should be considered "unsafe." If a header name is found in
the reserved list, it will be prefixed with "reserved_".
[default: _id]
--prefix <string> Certain systems do not allow header names to start with "_" (e.g. CKAN Datastore).
This option allows the specification of the unsafe prefix to use when a header
starts with "_". [default: unsafe_]
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
Note that no output is generated for Verify and
Verbose modes.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;
use std::collections::HashMap;
use ahash::RandomState;
use serde::{Deserialize, Serialize};
use crate::{
config::{Config, Delimiter},
util, CliResult,
};
#[derive(Deserialize)]
struct Args {
arg_input: Option<String>,
flag_mode: String,
flag_reserved: String,
flag_prefix: String,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
}
#[derive(PartialEq)]
enum SafeNameMode {
Always,
Conditional,
Verify,
VerifyVerbose,
VerifyVerboseJSON,
VerifyVerbosePrettyJSON,
}
#[derive(Serialize, Deserialize)]
struct SafeNamesStruct {
header_count: usize,
duplicate_count: usize,
duplicate_headers: Vec<String>,
unsafe_headers: Vec<String>,
safe_headers: Vec<String>,
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
// set SafeNames Mode
let first_letter = args.flag_mode.chars().next().unwrap_or_default();
let safenames_mode = match first_letter {
'c' | 'C' => SafeNameMode::Conditional,
'a' | 'A' => SafeNameMode::Always,
'v' => SafeNameMode::Verify,
'V' => SafeNameMode::VerifyVerbose,
'j' => SafeNameMode::VerifyVerboseJSON,
'J' => SafeNameMode::VerifyVerbosePrettyJSON,
_ => {
return fail_clierror!("Invalid mode: {}", args.flag_mode);
},
};
let reserved_names_vec: Vec<String> = args
.flag_reserved
.split(',')
.map(str::to_lowercase)
.collect();
let rconfig = Config::new(args.arg_input.as_ref()).delimiter(args.flag_delimiter);
let mut rdr = rconfig.reader()?;
let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;
let old_headers = rdr.byte_headers()?;
let mut headers = csv::StringRecord::from_byte_record_lossy(old_headers.clone());
// trim enclosing quotes and spaces from headers as it messes up safenames
// csv library will automatically add quotes when necessary when we write it
let mut noquote_headers = csv::StringRecord::new();
for header in &headers {
noquote_headers.push_field(header.trim_matches(|c| c == '"' || c == ' '));
}
let (safe_headers, changed_count) = util::safe_header_names(
&noquote_headers,
true,
safenames_mode == SafeNameMode::Conditional,
Some(reserved_names_vec).as_ref(),
&args.flag_prefix,
false,
);
if let SafeNameMode::Conditional | SafeNameMode::Always = safenames_mode {
headers.clear();
for header_name in safe_headers {
headers.push_field(&header_name);
}
// write CSV with safe headers
wtr.write_record(headers.as_byte_record())?;
let mut record = csv::ByteRecord::new();
while rdr.read_byte_record(&mut record)? {
wtr.write_record(&record)?;
}
wtr.flush()?;
eprintln!("{changed_count}");
} else {
// Verify or VerifyVerbose Mode
let mut safenames_vec: Vec<String> = Vec::new();
let mut unsafenames_vec: Vec<String> = Vec::new();
let mut checkednames_map: HashMap<String, u16, RandomState> = HashMap::default();
let mut temp_string;
for header_name in &headers {
if safe_headers.contains(&header_name.to_string()) {
if !safenames_vec.contains(&header_name.to_string()) {
safenames_vec.push(header_name.to_string());
}
} else {
unsafenames_vec.push(header_name.to_string());
};
temp_string = header_name.to_string();
if let Some(count) = checkednames_map.get(&temp_string) {
checkednames_map.insert(temp_string, count + 1);
} else {
checkednames_map.insert(temp_string, 1);
};
}
let headers_count = headers.len();
let dupe_count = checkednames_map.values().filter(|&&v| v > 1).count();
let unsafe_count = unsafenames_vec.len();
let safe_count = safenames_vec.len();
let safenames_struct = SafeNamesStruct {
header_count: headers_count,
duplicate_count: dupe_count,
duplicate_headers: checkednames_map
.iter()
.filter(|(_, &v)| v > 1)
.map(|(k, v)| format!("{k}:{v}"))
.collect(),
unsafe_headers: unsafenames_vec.clone(),
safe_headers: safenames_vec.clone(),
};
match safenames_mode {
SafeNameMode::VerifyVerbose => {
eprintln!(
r#"{num_headers} header/s
{dupe_count} duplicate/s: {dupe_headers:?}
{unsafe_count} unsafe header/s: {unsafenames_vec:?}
{num_safeheaders} safe header/s: {safenames_vec:?}"#,
dupe_headers = safenames_struct.duplicate_headers.join(", "),
num_headers = headers_count,
num_safeheaders = safe_count
);
},
SafeNameMode::VerifyVerboseJSON | SafeNameMode::VerifyVerbosePrettyJSON => {
if safenames_mode == SafeNameMode::VerifyVerbosePrettyJSON {
println!(
"{}",
serde_json::to_string_pretty(&safenames_struct).unwrap()
);
} else {
println!("{}", serde_json::to_string(&safenames_struct).unwrap());
};
},
_ => eprintln!("{unsafe_count}"),
}
}
Ok(())
}