From 3a4ce7999d7de9a8f2de6d1bc98224ccc4f7466b Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan
Date: Wed, 2 Oct 2024 23:47:52 -0700
Subject: [PATCH] added script for creating index (#213)

---
 CMakeLists.txt               |   8 ++
 docs/utilities.rst           |  18 +++++
 script/dftracer_anonymize    |  34 ++++++---
 script/dftracer_create_index | 144 +++++++++++++++++++++++++++++++++++
 setup.py                     |   3 +-
 5 files changed, 197 insertions(+), 10 deletions(-)
 create mode 100755 script/dftracer_create_index

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3320ce9..08951fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -459,6 +459,14 @@ install(
         bin
 )
 
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_create_index ${EXECUTABLE_OUTPUT_PATH}/dftracer_create_index COPYONLY)
+install(
+        PROGRAMS  # shell script: PROGRAMS adds execute permission, FILES would install it non-executable
+        ${EXECUTABLE_OUTPUT_PATH}/dftracer_create_index
+        DESTINATION
+        bin
+)
+
 #cmake_policy(SET CMP0079 NEW) # In case that we need more control over the target building order
 
 if(DFTRACER_ENABLE_TESTS)
diff --git a/docs/utilities.rst b/docs/utilities.rst
index 8f8f489..369c103 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -87,3 +87,21 @@ Arguments for this script are:
 4. **-h** display help
 5. **-d input_directory** specify input directories. should contain .pfw or .pfw.gz files.
 6. **-o output_directory** specify output directory.
+
+-------------------
+Create Index script
+-------------------
+
+The script optionally compresses and then creates an index for every dftracer trace.
+
+.. code-block:: bash
+
+    <install-dir>/bin/dftracer_create_index [-fcv] [-d input_directory]
+
+Arguments for this script are:
+
+1. **-f** override indices.
+2. **-c** compress input file. Uncompressed .pfw files are only indexed when this is set.
+3. **-v** enable verbose mode
+4. **-h** display help
+5. **-d input_directory** specify input directories. should contain .pfw or .pfw.gz files.
diff --git a/script/dftracer_anonymize b/script/dftracer_anonymize
index 68f7440..a55c47d 100755
--- a/script/dftracer_anonymize
+++ b/script/dftracer_anonymize
@@ -76,18 +76,30 @@ echo "Setting up output directory"
 rm -rf ${OUTPUT_DIR}
 mkdir -p ${OUTPUT_DIR}
 
-pfw_count=`ls -1 $LOG_DIR/*.pfw 2> /dev/null | wc -l`
-gz_count=`ls -1 $LOG_DIR/*.gz 2> /dev/null | wc -l`
-total=$((pfw_count + gz_count))
+total=0 # set to 1 only when a trace really exists in LOG_DIR (an unmatched glob stays literal)
+for file in "$LOG_DIR"/*.pfw*; do [ -e "$file" ] && total=1; break; done
 if [ $total == 0 ]; then
     echo "The folder does not contain any pfw or pfw.gz files."
     exit 0
 fi
 
-count=1
-
+files=("$LOG_DIR"/*.pfw*)
+total=${#files[@]}
+JOBS_LIMIT=64 # maximum number of background jobs running at once
 # loop over logs
-for file in "$LOG_DIR"/*.pfw*; do
+for file_index in "${!files[@]}"; do
+  file=${files[$file_index]}
+  running_jobs=$(jobs -rp | wc -l)
+  if [ $running_jobs -ge $JOBS_LIMIT ]; then
+    date_echo "waiting for Running $running_jobs jobs to be less than $JOBS_LIMIT" # NOTE(review): assumes date_echo is defined earlier in this script - confirm
+    while [ $running_jobs -ge $JOBS_LIMIT ]
+    do
+      sleep 1
+      running_jobs=$(jobs -rp | wc -l)
+    done
+    date_echo "Running $running_jobs jobs are now less than $JOBS_LIMIT"
+  fi
+  { # each file is anonymized in its own background job, throttled by the check above
   # only look at files
   if [ -f "$file" ]; then
     # calculate basename and copy files
@@ -97,7 +109,7 @@ for file in "$LOG_DIR"/*.pfw*; do
     cp $LOG_DIR/$filename $OUTPUT_DIR/.tmp.$filename
     if [ "$ext" == "gz" ]; then
       # if file is gz get the name
-      name=${count%.pfw.gz}
+      name=$((file_index + 1)) # keep the 1-based output names the sequential counter produced
       echo "extracted name $name"
       # replace non utf characters
       gunzip -c $OUTPUT_DIR/.tmp.$filename | sed -e "s/${USER}/USER/g" > $OUTPUT_DIR/$name.pfw
@@ -110,7 +122,7 @@ for file in "$LOG_DIR"/*.pfw*; do
       fi
     else
       # if file is pfw get the name
-      name=${count%.pfw}
+      name=$((file_index + 1)) # keep the 1-based output names the sequential counter produced
       echo "extracted name $name"
       # replace non utf characters
       cat $OUTPUT_DIR/.tmp.$filename | sed -e "s/${USER}/USER/g" > $OUTPUT_DIR/$name.pfw
@@ -124,5 +136,9 @@ for file in "$LOG_DIR"/*.pfw*; do
     # remove temp file
     rm $OUTPUT_DIR/.tmp.$filename
   fi
-  count=$((count+1))
+  } &
 done
+
+wait
+
+date_echo Finished anonymization of traces.
diff --git a/script/dftracer_create_index b/script/dftracer_create_index
new file mode 100755
index 0000000..373fe1f
--- /dev/null
+++ b/script/dftracer_create_index
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# The script creates indices for the dftracer traces
+# This has the following signature.
+#
+# usage: dftracer_create_index [-fcv] [-d input_directory]
+#   -f                      override indices
+#   -c                      compress input
+#   -v                      enable verbose mode
+#   -h                      display help
+#   -d input_directory      specify input directories. should contain .pfw or .pfw.gz files.
+
+# Print the arguments prefixed with the current date and time.
+date_echo() {
+  dt=$(date '+%d/%m/%Y %H:%M:%S');
+  echo "$dt $*"
+}
+
+LOG_DIR=$PWD
+override=0
+compressed=0
+
+# Print the command line help; the caller decides the exit status.
+function usage {
+  echo "usage: $(basename $0) [-fcv] [-d input_directory]"
+  echo "  -f                      override indices"
+  echo "  -c                      compress input"
+  echo "  -v                      enable verbose mode"
+  echo "  -h                      display help"
+  echo "  -d input_directory      specify input directories. should contain .pfw or .pfw.gz files."
+}
+
+# Block until fewer than JOBS_LIMIT background jobs are running.
+wait_for_job_slot() {
+  running_jobs=$(jobs -rp | wc -l)
+  if [ "$running_jobs" -ge "$JOBS_LIMIT" ]; then
+    date_echo "waiting for Running $running_jobs jobs to be less than $JOBS_LIMIT"
+    while [ "$running_jobs" -ge "$JOBS_LIMIT" ]
+    do
+      sleep 1
+      running_jobs=$(jobs -rp | wc -l)
+    done
+    date_echo "Running $running_jobs jobs are now less than $JOBS_LIMIT"
+  fi
+}
+
+# Index one gzip-compressed trace. $1 is the file, $2 its 1-based position.
+index_gz_file() {
+  if [ -f "$1.zindex" ]; then
+    date_echo "Index already exists for file $1 $2 of $total"
+  elif "$zindex_exec" "$1" --index-file "file:$1.zindex" --regex 'id:([0-9]+)' --numeric --unique; then
+    date_echo "Created index for file $1 $2 of $total"
+  else
+    date_echo "failure: could not create index for file $1"
+  fi
+}
+
+while getopts ':cvfd:h' opt; do
+  case "$opt" in
+    d) LOG_DIR="${OPTARG}" ;;
+    f) override=1 ;;
+    v) set -x ;;
+    c) compressed=1 ;;
+    h)
+      usage
+      exit 0
+      ;;
+    :)
+      echo -e "option requires an argument.\n"
+      usage
+      exit 1
+      ;;
+    \?)
+      echo -e "Invalid command option.\n"
+      usage
+      exit 1
+      ;;
+  esac
+done
+shift "$(($OPTIND -1))"
+
+# Collect only real traces. The -f test drops unmatched (literal) globs, and
+# naming both suffixes keeps existing *.zindex files out of the work list.
+files=()
+for file in "$LOG_DIR"/*.pfw "$LOG_DIR"/*.pfw.gz; do
+  if [ -f "$file" ]; then files+=("$file"); fi
+done
+total=${#files[@]}
+if [ "$total" == 0 ]; then
+  date_echo "The folder does not contain any pfw or pfw.gz files."
+  exit 1
+fi
+
+# Use one interpreter (python3) for both the import check and the lookup.
+python3 -c "import zindex_py;"
+status=$?
+if [[ $status != 0 ]]; then
+  date_echo "failure: $status: zindex not found. Please install zindex with: pip install zindex_py"
+  exit 1
+fi
+zindex_exec=$(python3 -c 'import zindex_py;import site; sp=site.getsitepackages()[0]; print(f"{sp}/zindex_py/bin/zindex")')
+if [ ! -f "${zindex_exec}" ]; then
+  date_echo "failure: zindex not found at ${zindex_exec}. Please install zindex with: pip install zindex_py"
+  exit 1
+else
+  date_echo "Found zindex executable at ${zindex_exec}"
+fi
+
+if [ "$override" == "1" ]; then
+  date_echo "Removing existing indices as override is passed."
+  rm -f -- "$LOG_DIR"/*.zindex
+fi
+
+# nproc is GNU-specific, so fall back to getconf and finally to one job.
+JOBS_LIMIT=$(nproc --all 2> /dev/null || getconf _NPROCESSORS_ONLN 2> /dev/null || echo 1)
+# loop over logs
+for file_index in "${!files[@]}"; do
+  file=${files[$file_index]}
+  position=$((file_index + 1))
+  wait_for_job_slot
+  case "$file" in
+    *.pfw.gz)
+      index_gz_file "$file" "$position" &
+      ;;
+    *.pfw)
+      if [ -f "$file.gz.zindex" ]; then
+        date_echo "Index already exists for file $file.gz $position of $total"
+      elif [ "$compressed" != "1" ]; then
+        date_echo "Skipping file $file as it is not compressed. Pass -c to compress and index it."
+      else
+        {
+          date_echo "Compressing file $(basename -- "$file" .pfw)"
+          if gzip "$file"; then
+            index_gz_file "$file.gz" "$position"
+          else
+            date_echo "failure: could not compress file $file"
+          fi
+        } &
+      fi
+      ;;
+  esac
+done
+wait
+date_echo Creation of index finished
diff --git a/setup.py b/setup.py
index d7a8145..61d8c5e 100644
--- a/setup.py
+++ b/setup.py
@@ -195,7 +195,8 @@ def build_extension(self, ext: CMakeExtension) -> None:
     scripts=['script/dftracer_compact',
              'script/dftracer_merge',
              'script/dftracer_sanitize',
-             'script/dftracer_anonymize'],
+             'script/dftracer_anonymize',
+             'script/dftracer_create_index'],
     package_dir={"dftracer": "dftracer",
                  "dftracer_dbg": "dftracer_dbg",
                  "dfanalyzer": "dfanalyzer"},