From 523207e94fdd956d1809143b348d27e3b7160f53 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:32:20 -0400 Subject: [PATCH 1/4] `benchmarks`: fixed binaries existence check as qsv_bin is used before existence was checked --- scripts/benchmarks.sh | 65 ++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index cdf602534..12513b0cd 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -51,6 +51,38 @@ benchmark_runs=3 data_filename=$(basename -- "$data") filestem="${data_filename%.*}" +# check if binaries are installed --------- +# check if qsv is installed +if ! command -v "$qsv_bin" &> /dev/null +then + echo "qsv could not be found" + echo "Please install Quicksilver (qsv) from https://qsv.dathere.com" + exit +fi + +# set sevenz_bin to "7z" on Linux/Cygwin and "7zz" on macOS +if [[ "$OSTYPE" == "darwin"* ]]; then + sevenz_bin=7zz +else + sevenz_bin=7z +fi + +# check if 7z is installed +if ! command -v "$sevenz_bin" &> /dev/null +then + echo "ERROR: $sevenz_bin could not be found" + echo "Please install 7-Zip v23.01 and above" + exit +fi + +# check if hyperfine is installed +if ! command -v hyperfine &> /dev/null +then + echo "ERROR: hyperfine could not be found" + echo "Please install hyperfine v1.17.0 and above" + exit +fi + # qsv version metadata ---------------- # get current version of qsv raw_version=$("$qsv_bin" --version) @@ -80,7 +112,7 @@ else fi # the version of this script -bm_version=2.1.0 +bm_version=2.1.1 function cleanup_files { # Clean up temporary files @@ -146,37 +178,6 @@ SECONDS=0 cleanup_files -# check if qsv is installed -if ! command -v "$qsv_bin" &> /dev/null -then - echo "qsv could not be found" - echo "Please install Quicksilver (qsv) from https://qsv.dathere.com" - exit -fi - -# set sevenz_bin to "7z" on Linux/Cygwin and "7zz" on macOS -if [[ "$OSTYPE" == "darwin"* ]]; then - sevenz_bin=7zz -else - sevenz_bin=7z -fi - -# check if 7z is installed -if ! command -v "$sevenz_bin" &> /dev/null -then - echo "ERROR: $sevenz_bin could not be found" - echo "Please install 7-Zip v23.01 and above" - exit -fi - -# check if hyperfine is installed -if ! command -v hyperfine &> /dev/null -then - echo "ERROR: hyperfine could not be found" - echo "Please install hyperfine v1.17.0 and above" - exit -fi - if [ ! -r "$data" ]; then echo "> Downloading Benchmark data..." curl -sS "$benchmark_data_url" > "$datazip" From 60b07a9395e5731d6dfd1692dfc929a4df8f7935 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:47:52 -0400 Subject: [PATCH 2/4] apply shellcheck static lint suggestions % shellcheck benchmarks.sh In benchmarks.sh line 89: version=$(echo $raw_version | cut -d' ' -f2 | cut -d'-' -f1) ^----------^ SC2086 (info): Double quote to prevent globbing and word splitting. Did you mean: version=$(echo "$raw_version" | cut -d' ' -f2 | cut -d'-' -f1) In benchmarks.sh line 91: platform=$(echo $raw_version | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/') ^-- SC2001 (style): See if you can use ${variable//search/replace} instead. ^----------^ SC2086 (info): Double quote to prevent globbing and word splitting. Did you mean: platform=$(echo "$raw_version" | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/') In benchmarks.sh line 93: kind=$(echo $raw_version | sed 's/.* \([a-zA-Z]*\)$/\1/') ^-- SC2001 (style): See if you can use ${variable//search/replace} instead. ^----------^ SC2086 (info): Double quote to prevent globbing and word splitting. Did you mean: kind=$(echo "$raw_version" | sed 's/.* \([a-zA-Z]*\)$/\1/') In benchmarks.sh line 191: printf "Benchmark data rowcount: %'.0f\n" $rowcount ^-------^ SC2086 (info): Double quote to prevent globbing and word splitting. Did you mean: printf "Benchmark data rowcount: %'.0f\n" "$rowcount" In benchmarks.sh line 233: local cmd="$@" ^--^ SC2124 (warning): Assigning an array to a string! Assign as array, or use * instead of @ to concatenate. In benchmarks.sh line 392: printf "> Commands to benchmark: $total_count, w/o index: $wo_index_count, with index: $with_index_count\n\n" ^-- SC2059 (info): Don't use variables in the printf format string. Use printf '..%s..' "$foo". For more information: https://www.shellcheck.net/wiki/SC2124 -- Assigning an array to a string! A... https://www.shellcheck.net/wiki/SC2059 -- Don't use variables in the printf... https://www.shellcheck.net/wiki/SC2086 -- Double quote to prevent globbing ... --- scripts/benchmarks.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index 12513b0cd..9f0954c71 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -86,11 +86,11 @@ fi # qsv version metadata ---------------- # get current version of qsv raw_version=$("$qsv_bin" --version) -version=$(echo $raw_version | cut -d' ' -f2 | cut -d'-' -f1) +version=$(echo "$raw_version" | cut -d' ' -f2 | cut -d'-' -f1) # get target platform from version -platform=$(echo $raw_version | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/') +platform=$(echo "$raw_version" | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/') # get qsv kind -kind=$(echo $raw_version | sed 's/.* \([a-zA-Z]*\)$/\1/') +kind=$(echo "$raw_version" | sed 's/.* \([a-zA-Z]*\)$/\1/') # get num cores & memory size if [[ "$OSTYPE" == "darwin"* ]]; then @@ -188,7 +188,7 @@ fi # we get the rowcount, just in case the benchmark data was modified by the user to tailor # the benchmark to their system/workload. We use the rowcount to compute records per second rowcount=$("$qsv_bin" count "$data") -printf "Benchmark data rowcount: %'.0f\n" $rowcount +printf "Benchmark data rowcount: %'.0f\n" "$rowcount" echo "" if [ ! -r communityboards.csv ]; then @@ -230,7 +230,7 @@ commands_with_index_name=() function add_command { local dest_array="$1" shift - local cmd="$@" + local cmd="$*" if [[ "$dest_array" == "without_index" ]]; then commands_without_index+=("$cmd") @@ -389,8 +389,7 @@ run --index validate_no_schema_index "$qsv_bin" validate "$data" with_index_count=${#commands_with_index[@]} wo_index_count=${#commands_without_index[@]} total_count=$((with_index_count + wo_index_count)) -printf "> Commands to benchmark: $total_count, w/o index: $wo_index_count, with index: $with_index_count\n\n" - +printf "> Commands to benchmark: %s, w/o index: %s, with index: %s\n\n" "$total_count" "$wo_index_count" "$with_index_count" # --------------------------------------- # Prepare benchmark results directory From cbcda54d0c256bddf481f136d51af89200e74c65 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:56:20 -0400 Subject: [PATCH 3/4] apply shell-format formatting --- scripts/benchmarks.sh | 115 ++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 59 deletions(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index 9f0954c71..9834c854b 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -20,7 +20,7 @@ # it was also designed to be a useful tool for users to benchmark qsv on their own systems, # so it be can run on hardware and workloads that reflect your requirements/environment. # -# Make sure you're using a release-optimized `qsv`. +# Make sure you're using a release-optimized `qsv`. # If you can't use the prebuilt binaries at https://github.com/jqnatividad/qsv/releases/latest, # build it to have at least the apply, geocode, luau, to and polars features enabled: # i.e. `cargo build --release --locked -F feature_capable,apply,geocode,luau,to,polars` or @@ -53,11 +53,10 @@ filestem="${data_filename%.*}" # check if binaries are installed --------- # check if qsv is installed -if ! command -v "$qsv_bin" &> /dev/null -then - echo "qsv could not be found" - echo "Please install Quicksilver (qsv) from https://qsv.dathere.com" - exit +if ! command -v "$qsv_bin" &>/dev/null; then + echo "qsv could not be found" + echo "Please install Quicksilver (qsv) from https://qsv.dathere.com" + exit fi # set sevenz_bin to "7z" on Linux/Cygwin and "7zz" on macOS @@ -68,19 +67,17 @@ else fi # check if 7z is installed -if ! command -v "$sevenz_bin" &> /dev/null -then - echo "ERROR: $sevenz_bin could not be found" - echo "Please install 7-Zip v23.01 and above" - exit +if ! command -v "$sevenz_bin" &>/dev/null; then + echo "ERROR: $sevenz_bin could not be found" + echo "Please install 7-Zip v23.01 and above" + exit fi # check if hyperfine is installed -if ! command -v hyperfine &> /dev/null -then - echo "ERROR: hyperfine could not be found" - echo "Please install hyperfine v1.17.0 and above" - exit +if ! command -v hyperfine &>/dev/null; then + echo "ERROR: hyperfine could not be found" + echo "Please install hyperfine v1.17.0 and above" + exit fi # qsv version metadata ---------------- @@ -94,21 +91,21 @@ kind=$(echo "$raw_version" | sed 's/.* \([a-zA-Z]*\)$/\1/') # get num cores & memory size if [[ "$OSTYPE" == "darwin"* ]]; then - # macOS - num_cores=$(sysctl -n hw.ncpu) - mem_size=$(sysctl -n hw.memsize) + # macOS + num_cores=$(sysctl -n hw.ncpu) + mem_size=$(sysctl -n hw.memsize) elif [[ "$OSTYPE" == "linux-gnu"* ]]; then - # Linux - num_cores=$(nproc) - mem_size=$(free -b | awk '/Mem/ {print $7}') + # Linux + num_cores=$(nproc) + mem_size=$(free -b | awk '/Mem/ {print $7}') elif [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" ]]; then - # Windows - num_cores=$(wmic cpu get NumberOfCores | grep -Eo '^[0-9]+') - mem_size=$(wmic OS get FreePhysicalMemory | grep -Eo '[0-9]+') - mem_size=$((mem_size * 1024)) + # Windows + num_cores=$(wmic cpu get NumberOfCores | grep -Eo '^[0-9]+') + mem_size=$(wmic OS get FreePhysicalMemory | grep -Eo '[0-9]+') + mem_size=$((mem_size * 1024)) else - echo "Unsupported operating system: $OSTYPE" - exit 1 + echo "Unsupported operating system: $OSTYPE" + exit 1 fi # the version of this script @@ -180,7 +177,7 @@ cleanup_files if [ ! -r "$data" ]; then echo "> Downloading Benchmark data..." - curl -sS "$benchmark_data_url" > "$datazip" + curl -sS "$benchmark_data_url" >"$datazip" "$sevenz_bin" e -y "$datazip" echo "" fi @@ -193,7 +190,7 @@ echo "" if [ ! -r communityboards.csv ]; then echo "> Downloading community board data..." - curl -sS https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/communityboards.csv > communityboards.csv + curl -sS https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/communityboards.csv >communityboards.csv echo "" fi @@ -212,11 +209,11 @@ if [ ! -r data_to_exclude.csv ]; then echo " benchmark_data.jsonl..." "$qsv_bin" tojsonl "$data" --output benchmark_data.jsonl echo " benchmark_data.schema.json..." - "$qsv_bin" schema "$data" --stdout > benchmark_data.csv.schema.json + "$qsv_bin" schema "$data" --stdout >benchmark_data.csv.schema.json echo " benchmark_data.snappy..." "$qsv_bin" snappy compress "$data" --output benchmark_data.snappy echo " searchset_patterns.txt..." - printf "homeless\npark\nnoise\n" > searchset_patterns.txt + printf "homeless\npark\nnoise\n" >searchset_patterns.txt echo "" fi @@ -230,8 +227,8 @@ commands_with_index_name=() function add_command { local dest_array="$1" shift - local cmd="$*" - + local cmd="$*" + if [[ "$dest_array" == "without_index" ]]; then commands_without_index+=("$cmd") else @@ -243,13 +240,13 @@ function run { local index= while true; do case "$1" in - --index) - index="yes" - shift - ;; - *) - break - ;; + --index) + index="yes" + shift + ;; + *) + break + ;; esac done @@ -277,7 +274,7 @@ run apply_datefmt "$qsv_bin apply datefmt \"Created Date\" $data" run apply_datefmt_multi "$qsv_bin apply datefmt \"Created Date,Closed Date,Due Date\" $data" run apply_dynfmt "$qsv_bin apply dynfmt --formatstr \"{Created Date} {Complaint Type} - {BBL} {City}\" --new-column new_col $data" run apply_emptyreplace "$qsv_bin" apply emptyreplace \"Bridge Highway Name\" --replacement Unspecified "$data" -run apply_op_eudex "$qsv_bin apply operations lower,eudex Agency --comparand Queens --new-column Agency_queens_soundex $data" +run apply_op_eudex "$qsv_bin apply operations lower,eudex Agency --comparand Queens --new-column Agency_queens_soundex $data" run apply_op_string "$qsv_bin apply operations lower Agency $data" run apply_op_similarity "$qsv_bin apply operations lower,simdln Agency --comparand brooklyn --new-column Agency_sim-brooklyn_score $data" run behead "$qsv_bin" behead "$data" @@ -353,15 +350,15 @@ run sortcheck_unsorted_all "$qsv_bin" sortcheck --all data_unsorted.csv run split "$qsv_bin" split --size 50000 split_tempdir "$data" run --index split_index "$qsv_bin" split --size 50000 split_tempdir "$data" run --index split_index_j1 "$qsv_bin" split --size 50000 -j 1 split_tempdir "$data" -run sqlp "$qsv_bin" sqlp "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' +run sqlp "$qsv_bin" sqlp "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' run sqlp_format_arrow "$qsv_bin" sqlp --format arrow "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' run sqlp_format_json "$qsv_bin" sqlp --format json "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' run sqlp_format_parquet "$qsv_bin" sqlp --format parquet "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' run sqlp_format_parquet_statistics "$qsv_bin" sqlp --format parquet --statistics "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' -run sqlp_lowmemory "$qsv_bin" sqlp "$data" -Q --low-memory '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' -run sqlp_nooptimizations "$qsv_bin" sqlp "$data" -Q --no-optimizations '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' -run sqlp_tryparsedates "$qsv_bin" sqlp "$data" -Q --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' -run sqlp_tryparsedates_inferlen "$qsv_bin" sqlp "$data" -Q --infer-len 10000 --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' +run sqlp_lowmemory "$qsv_bin" sqlp "$data" -Q --low-memory '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' +run sqlp_nooptimizations "$qsv_bin" sqlp "$data" -Q --no-optimizations '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' +run sqlp_tryparsedates "$qsv_bin" sqlp "$data" -Q --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' +run sqlp_tryparsedates_inferlen "$qsv_bin" sqlp "$data" -Q --infer-len 10000 --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"' run stats "$qsv_bin" stats --force "$data" run --index stats_index "$qsv_bin" stats --force "$data" run --index stats_index_j1 "$qsv_bin" stats -j 1 --force "$data" @@ -400,7 +397,7 @@ fi # Init latest_results.csv. It stores the benchmark results for this run rm -f results/latest_results.csv -echo "version,tstamp,name,mean,stddev,median,user,system,min,max" > results/latest_results.csv +echo "version,tstamp,name,mean,stddev,median,user,system,min,max" >results/latest_results.csv # check if the file benchmark_results.csv exists, if it doesn't create it # by copying the empty latest_results.csv @@ -432,11 +429,11 @@ for command_no_index in "${commands_without_index[@]}"; do echo "$name_idx. ${commands_without_index_name[$idx]}" hyperfine --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \ "$command_no_index" - + # prepend version, tstamp & benchmark name to the hyperfine results - echo "version,tstamp,name" > results/results_work.csv - echo "$version,$now,${commands_without_index_name[$idx]}" >> results/results_work.csv - + echo "version,tstamp,name" >results/results_work.csv + echo "$version,$now,${commands_without_index_name[$idx]}" >>results/results_work.csv + # remove the command column from the hyperfine results, we just need the name "$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv @@ -462,7 +459,7 @@ if [ "$with_index_count" -gt 0 ]; then rm -f "$data".idx "$qsv_bin" index "$data" "$qsv_bin" stats "$data" --everything --infer-dates --force \ - --output benchmark_work.stats.csv + --output benchmark_work.stats.csv fi idx=0 @@ -470,8 +467,8 @@ for command_with_index in "${commands_with_index[@]}"; do echo "$name_idx. ${commands_with_index_name[$idx]}" hyperfine --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \ "$command_with_index" - echo "version,tstamp,name" > results/results_work.csv - echo "$version,$now,${commands_with_index_name[$idx]}" >> results/results_work.csv + echo "version,tstamp,name" >results/results_work.csv + echo "$version,$now,${commands_with_index_name[$idx]}" >>results/results_work.csv "$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv "$qsv_bin" cat columns results/results_work.csv results/hf_result_nocmd.csv \ -o results/entry.csv @@ -493,13 +490,13 @@ done echo "" # sort the benchmark results by version, tstamp & name "$qsv_bin" sort --select version,tstamp,name results/latest_results.csv \ - -o results/results_work.csv + -o results/results_work.csv # compute records per second for each benchmark using luau by dividing rowcount by mean # we then round the result to a whole number and format with commas for readability luau_cmd="recs_per_sec=( $rowcount / mean); return numWithCommas(recs_per_sec)" "$qsv_bin" luau --begin file:benchmark_helper.luau map recs_per_sec "$luau_cmd" \ - results/results_work.csv -o results/latest_results.csv + results/results_work.csv -o results/latest_results.csv # Concatenate the final results of this run to results/bechmark_results.csv "$qsv_bin" cat rowskey results/latest_results.csv results/benchmark_results.csv \ @@ -516,7 +513,7 @@ elapsed=$SECONDS # Init latest_run_info.csv. It stores the benchmark run info for this run rm -f results/latest_run_info.tsv -echo -e "version\ttstamp\tlogtime\tbm_version\tplatform\tcores\tmem\tkind\targument\ttotal_count\two_index_count\twith_index_count\twarmup_runs\tbenchmark_runs\telapsed_secs\tversion_info" > results/latest_run_info.tsv +echo -e "version\ttstamp\tlogtime\tbm_version\tplatform\tcores\tmem\tkind\targument\ttotal_count\two_index_count\twith_index_count\twarmup_runs\tbenchmark_runs\telapsed_secs\tversion_info" >results/latest_run_info.tsv # check if the file run_info_history.csv exists, if it doesn't create it # by copying the empty latest_run_info.csv @@ -525,7 +522,7 @@ if [ ! -f "results/run_info_history.tsv" ]; then fi # append the run info to latest_run_info.csv -echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$kind\t$pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$raw_version" >> results/latest_run_info.tsv +echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$kind\t$pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$raw_version" >>results/latest_run_info.tsv # now update the run_info_history.tsv "$qsv_bin" cat rowskey results/latest_run_info.tsv results/run_info_history.tsv \ From e4f6341e24a5f3ede61da6873edbd4a9a49707c5 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:16:53 -0400 Subject: [PATCH 4/4] move script version up top; no need to store 7z file in /tmp [skip ci] --- scripts/benchmarks.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index 9834c854b..ccaa66a12 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -39,11 +39,14 @@ pat="$1" +# the version of this script +bm_version=2.1.1 + # configurable variables - change as needed to reflect your environment/workloads qsv_bin=qsv benchmark_data_url=https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/NYC_311_SR_2010-2020-sample-1M.7z # where to download the benchmark data compressed file - this could be a zip or 7z file -datazip=/tmp/NYC_311_SR_2010-2020-sample-1M.7z +datazip=NYC_311_SR_2010-2020-sample-1M.7z # where to store the benchmark data data=NYC_311_SR_2010-2020-sample-1M.csv warmup_runs=2 @@ -108,9 +111,6 @@ else exit 1 fi -# the version of this script -bm_version=2.1.1 - function cleanup_files { # Clean up temporary files rm -f "$filestem".csv.*