From cc91d2a41c4178c55983934c9021c1c1bba77aac Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 3 Nov 2024 12:51:51 -0500 Subject: [PATCH 1/2] feat: `fetchpost` add `--payload-tpl` option to allow constructing JSON payload using MiniJinja templates --- src/cmd/fetchpost.rs | 69 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/src/cmd/fetchpost.rs b/src/cmd/fetchpost.rs index f546179ed..cef2411c8 100644 --- a/src/cmd/fetchpost.rs +++ b/src/cmd/fetchpost.rs @@ -3,6 +3,12 @@ static USAGE: &str = r#" Fetchpost fetches data from web services for every row using HTTP Post. As opposed to fetch, which uses HTTP Get. +CSV data is posted using two methods: +1. Column-list using the argument + The columns are used to construct the form data. +2. MiniJinja template using the --payload-tpl option + The template file is used to construct the JSON payload. + Fetchpost is integrated with `jaq` (a jq clone) to directly parse out values from an API JSON response. (See https://github.com/01mf02/jaq for more info on how to use the jaq JSON Query Language) @@ -121,7 +127,7 @@ Usage: qsv fetchpost ( ) [--jaq | --jaqfile ] [--http-header ...] [options] [] qsv fetchpost --help -Fetchpost options: +Fetchpost arguments: Name of the column with the URL. Otherwise, if the argument starts with `http`, the URL to use. Comma-delimited list of columns to insert into the HTTP Post body. @@ -130,6 +136,10 @@ Fetchpost options: with more indexing). Column ranges can also be specified. Finally, columns can be selected using regular expressions. See 'qsv select --help' for examples. + +Fetchpost options: + -t, --payload-tpl Instead of , use a MiniJinja template to construct a + JSON payload in the HTTP Post body. -c, --new-column Put the fetched values in a new column. Specifying this option results in a CSV. Otherwise, the output is in JSONL format. --jaq Apply jaq selector to API returned JSON response. @@ -241,6 +251,7 @@ use log::{ debug, error, info, log_enabled, warn, Level::{Debug, Trace, Warn}, }; +use minijinja::Environment; use rand::Rng; use regex::Regex; use reqwest::{ @@ -249,7 +260,6 @@ use reqwest::{ }; use serde::Deserialize; use serde_json::{json, Value}; -use simdutf8::basic::from_utf8; use simple_expand_tilde::expand_tilde; use url::Url; @@ -265,6 +275,7 @@ use crate::{ #[derive(Deserialize)] struct Args { + flag_payload_tpl: Option, flag_new_column: Option, flag_jaq: Option, flag_jaqfile: Option, @@ -439,13 +450,21 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }; // validate column-list is a list of valid column names - let cl_config = Config::new(args.arg_input.as_ref()) - .delimiter(args.flag_delimiter) - .trim(csv::Trim::All) - .no_headers(args.flag_no_headers) - .select(args.arg_column_list.clone()); + let cl_config = if args.flag_payload_tpl.is_none() { + Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .trim(csv::Trim::All) + .no_headers(args.flag_no_headers) + .select(args.arg_column_list.clone()) + } else { + Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .trim(csv::Trim::All) + .no_headers(args.flag_no_headers) + // we're constructing a payload, ensure all the columns are selected + .select(SelectColumns::parse("1-")?) + }; let col_list = cl_config.selection(&headers)?; - debug!("column-list: {col_list:?}"); // check if the url_column arg was passed as a URL literal // or as a column selector @@ -635,6 +654,18 @@ pub fn run(argv: &[&str]) -> CliResult<()> { report_wtr.write_byte_record(&report_headers)?; } + let mut template_content = String::new(); + let mut build_payload = false; + let payload_env_option = if let Some(template_file) = args.flag_payload_tpl { + template_content = fs::read_to_string(template_file)?; + let mut env = Environment::new(); + env.add_template("template", &template_content)?; + build_payload = true; + Some(env) + } else { + None + }; + // amortize memory allocations // why optimize for mem & speed, when we're just doing single-threaded, throttled URL fetches? // we still optimize since fetch is backed by a memoized cache (in memory or Redis, when --redis @@ -681,6 +712,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .collect(); let debug_flag = log_enabled!(Debug); + let mut rendered_json: Value; + let payload_env = if build_payload { + payload_env_option.unwrap() + } else { + Environment::empty() + }; while rdr.read_byte_record(&mut record)? { if show_progress { @@ -697,10 +734,22 @@ pub fn run(argv: &[&str]) -> CliResult<()> { form_body_jsonmap.insert( (header_key_vec[*col_idx]).to_string(), serde_json::Value::String( - from_utf8(&record[*col_idx]).unwrap_or_default().to_owned(), + simdutf8::basic::from_utf8(record.get(*col_idx).unwrap_or_default()) + .unwrap_or_default() + .to_owned(), ), ); } + + if build_payload { + rendered_json = serde_json::from_str( + &payload_env + .get_template("template")? + .render(&form_body_jsonmap)?, + )?; + form_body_jsonmap.clone_from(rendered_json.as_object().ok_or("Expected JSON object")?); + } + if debug_flag { // deserializing the form_body_jsonmap to a string is expensive // so we only do it when debug is enabled @@ -709,7 +758,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { if literal_url_used { url.clone_from(&literal_url); - } else if let Ok(s) = from_utf8(&record[column_index]) { + } else if let Ok(s) = simdutf8::basic::from_utf8(&record[column_index]) { s.clone_into(&mut url); } else { url = String::new(); From a2813a8e0fc39a5becf79229cb029b2fda29ef62 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 3 Nov 2024 12:52:34 -0500 Subject: [PATCH 2/2] tests: add `fetchpost --payload-tpl` tests --- tests/test_fetch.rs | 184 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/tests/test_fetch.rs b/tests/test_fetch.rs index 9a566f0f9..72d2c44b2 100644 --- a/tests/test_fetch.rs +++ b/tests/test_fetch.rs @@ -1513,3 +1513,187 @@ fn fetchpost_simple_report() { assert_eq!(got, expected); } + +#[test] +fn fetchpost_payload_template() { + let wrk = Workdir::new("fetchpost_tpl"); + wrk.create( + "data.csv", + vec![ + svec!["first_name", "last_name", "age", "city"], + svec!["John", "Smith", "35", "New York"], + svec!["Jane", "Doe", "28", "Los Angeles"], + svec!["Bob", "Jones", "42", "Chicago"], + ], + ); + + // Create template file + wrk.create_from_string( + "payload.tpl", + r#"{ + "firstName": "{{ first_name }}", + "lastName": "{{ last_name }}", + "age": {{ age }}, + "city": "{{ city }}" +}"#, + ); + + let mut cmd = wrk.command("fetchpost"); + cmd.arg("https://httpbin.org/post") + .arg("1-") + .arg("--payload-tpl") + .arg("payload.tpl") + .arg("--new-column") + .arg("response") + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + + let expected = vec![ + svec!["first_name", "last_name", "age", "city", "response"], + svec![ + "John", + "Smith", + "35", + "New York", + r#"{"args":{},"data":"","files":{},"form":{"age":"35","city":"New York","firstName":"John","lastName":"Smith"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"50","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-6b2f608527e3b127729e8409"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + svec![ + "Jane", + "Doe", + "28", + "Los Angeles", + r#"{"args":{},"data":"","files":{},"form":{"age":"28","city":"Los Angeles","firstName":"Jane","lastName":"Doe"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"51","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-47c9d7ed1247562762fdd379"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + svec![ + "Bob", + "Jones", + "42", + "Chicago", + r#"{"args":{},"data":"","files":{},"form":{"age":"42","city":"Chicago","firstName":"Bob","lastName":"Jones"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"48","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-784c0cdf4d78bf1257f9a4d4"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + ]; + + for (got_row, expected_row) in got.iter().skip(1).zip(expected.iter().skip(1)) { + // Assert first 4 columns match + assert_eq!(&got_row[..4], &expected_row[..4]); + // Assert the first 50 characters of response column match + assert_eq!( + &got_row[4][..50], + &expected_row[4][..50], + "Response column first 50 chars mismatch" + ); + } +} + +#[test] +fn fetchpost_payload_template_with_report() { + let wrk = Workdir::new("fetchpost_tpl_report"); + wrk.create( + "data.csv", + vec![ + svec!["first_name", "last_name", "age", "city"], + svec!["John", "Smith", "35", "New York"], + svec!["Jane", "Doe", "28", "Los Angeles"], + svec!["Bob", "Jones", "42", "Chicago"], + ], + ); + + // Create template file + wrk.create_from_string( + "payload.tpl", + r#"{ + "firstName": "{{ first_name }}", + "lastName": "{{ last_name }}", + "age": {{ age }}, + "city": "{{ city }}" +}"#, + ); + + let mut cmd = wrk.command("fetchpost"); + cmd.arg("https://httpbin.org/post") + .arg("1-") + .arg("--payload-tpl") + .arg("payload.tpl") + .arg("--new-column") + .arg("response") + .arg("--report") + .arg("short") + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + + let expected = vec![ + svec!["first_name", "last_name", "age", "city", "response"], + svec![ + "John", + "Smith", + "35", + "New York", + r#"{"args":{},"data":"","files":{},"form":{"age":"35","city":"New York","firstName":"John","lastName":"Smith"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"50","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-6b2f608527e3b127729e8409"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + svec![ + "Jane", + "Doe", + "28", + "Los Angeles", + r#"{"args":{},"data":"","files":{},"form":{"age":"28","city":"Los Angeles","firstName":"Jane","lastName":"Doe"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"51","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-47c9d7ed1247562762fdd379"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + svec![ + "Bob", + "Jones", + "42", + "Chicago", + r#"{"args":{},"data":"","files":{},"form":{"age":"42","city":"Chicago","firstName":"Bob","lastName":"Jones"},"headers":{"Accept":"*/*","Accept-Encoding":"zstd;q=1.0, br;q=0.8, gzip;q=0.6, deflate;q=0.4, *;q=0.2","Content-Length":"48","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"qsv/0.137.0 (aarch64-apple-darwin; fetchpost; compiled; https://github.com/jqnatividad/qsv)","X-Amzn-Trace-Id":"Root=1-6727a9d5-784c0cdf4d78bf1257f9a4d4"},"json":null,"origin":"149.88.100.35","url":"https://httpbin.org/post"}"# + ], + ]; + + for (got_row, expected_row) in got.iter().skip(1).zip(expected.iter().skip(1)) { + // Assert first 4 columns match + assert_eq!(&got_row[..4], &expected_row[..4]); + // Assert the first 50 characters of response column match + assert_eq!( + &got_row[4][..50], + &expected_row[4][..50], + "Response column first 50 chars mismatch" + ); + } + + let mut cmd = wrk.command("select"); + cmd.arg("url,form,status,cache_hit,retries,response") + .arg(wrk.load_test_file("data.csv.fetchpost-report.tsv")); + + let got: Vec> = wrk.read_stdout(&mut cmd); + + let expected = vec![ + svec!["url", "form", "status", "cache_hit", "retries", "response"], + svec![ + "https://httpbin.org/post", + "{\"bool_col\": String(\"true\"), \"col1\": String(\"a\"), \"number_col\": \ + String(\"42\")}", + "200", + "0", + "0", + r#"{"bool_col": String("true"), "col1": String("a"), "number_col": String("42")}"# + ], + svec![ + "https://httpbin.org/post", + "{\"bool_col\": String(\"false\"), \"col1\": String(\"b\"), \"number_col\": \ + String(\"3.14\")}", + "200", + "0", + "0", + r#"{"bool_col": String("false"), "col1": String("b"), "number_col": String("3.14")}"# + ], + svec![ + "https://httpbin.org/post", + "{\"bool_col\": String(\"true\"), \"col1\": String(\"c\"), \"number_col\": \ + String(\"666\")}", + "200", + "0", + "0", + r#"{"bool_col": String("true"), "col1": String("c"), "number_col": String("666")}"# + ], + ]; + assert_eq!(got, expected); +}