diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2fdd3a8..c8fb24f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: DFTracer Build and Test on: pull_request: - branches: [ main, dev ] + branches: [ main, develop ] push: jobs: build-and-test: @@ -79,9 +79,9 @@ jobs: mkdir coverage FILE=$PWD/coverage/coverage.json cd build - COVERALLS_REPO_TOKEN=${{ secrets.GITHUB_TOKEN }} gcovr -r ../ . --coveralls $FILE -e ../test/ -e ../src/example + COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS }} gcovr -r ../ . --coveralls $FILE -e ../test/ -e ../src/example if [ -e '$FILE' ]; then sed -i'' -e 's/"service_name": "github-actions-ci"/"service_name": "github"/' '$FILE' fi cat $FILE - curl -v -F json_file=@$FILE https://coveralls.io/api/v1/jobs \ No newline at end of file + curl -v -F json_file=@$FILE https://coveralls.io/api/v1/jobs diff --git a/.gitignore b/.gitignore index 76b27b5f..97c8184b 100644 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,7 @@ # Environment and Dependency venv* +.venv* dependency/.spack-env dependency/spack.lock /build_env/ @@ -70,9 +71,11 @@ logs/*.log install __pycache__ dftracer/__pycache__ +*.log +examples/dfanalyzer/test-trace.pfw.gz.zindex # Install files -dftracer_py.egg-info +pydftracer.egg-info /dist/ /output/ @@ -83,3 +86,7 @@ dftracer_py.egg-info # Debug files /*.core +dfanalyzer/dask/run_dir +dfanalyzer/dask/logs +dfanalyzer/dask/scripts/STDIN.* +*.zindex \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 436b1b59..a1b87217 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,5 +9,87 @@ "cmake.configureEnvironment": { "DARSHAN_PRELOAD_LIB": "/usr/WS2/haridev/spack/opt/spack/linux-rhel8-zen2/gcc-10.3.1/darshan-runtime-3.4.4-vckxthkq2hzzxnwmk4owtzcnfmjwl23s/lib/libdarshan.so", "DFTRACER_TEST_MACHINE": "corona" + }, + "files.associations": { + "any": "cpp", + "functional": "cpp", + "optional": "cpp", + "sstream": "cpp", + "array": 
"cpp", + "atomic": "cpp", + "hash_map": "cpp", + "hash_set": "cpp", + "strstream": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "cfenv": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "format": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0900d1ab..658e2bd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,8 @@ set(DFTRACER_PACKAGE_VERSION_MINOR "${DFTRACER_VERSION_PATCH}") set(DFTRACER_PACKAGE_STRING "${DFTRACER_PACKAGE_NAME} ${DFTRACER_PACKAGE_VERSION}") set(DFTRACER_PACKAGE_TARNAME "${DFTRACER_PACKAGE}") +set(DFTRACER_VERSION "(1, 0, 3)") + project(dftracer LANGUAGES C CXX) @@ -40,11 +42,14 
@@ if (CMAKE_INSTALL_LIBDIR) ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DOCDIR}) set(DFTRACER_INSTALL_SYSCONFDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_SYSCONFDIR}/modulefiles) + set(DFTRACER_INSTALL_BINFDIR + ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) else () set(DFTRACER_LIBDIR "lib") set(DFTRACER_INSTALL_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include") set(DFTRACER_INSTALL_DOCDIR "${CMAKE_INSTALL_PREFIX}/doc") set(DFTRACER_INSTALL_SYSCONFDIR "${CMAKE_INSTALL_PREFIX}/etc/modulefiles") + set(DFTRACER_INSTALL_BINARYDIR "${CMAKE_INSTALL_PREFIX}/bin") message(STATUS "DFTRACER_LIBDIR set to ${DFTRACER_LIBDIR}") endif () @@ -130,6 +135,7 @@ option (DFTRACER_INSTALL_DEPENDENCIES "Install DFTracer dependencies" OFF) option (DFTRACER_ENABLE_TESTS "Enable tests for DFTRACER." OFF) option (DFTRACER_ENABLE_DLIO_BENCHMARK_TESTS "Enable dlio_benchmark tests" OFF) option (DFTRACER_ENABLE_PAPER_TESTS "Enable paper tests" OFF) +set (DFTRACER_TEST_LD_LIBRARY_PATH "" CACHE STRING "Additional LD_LIBRARY_PATH to be included on testing") #------------------------------------------------------------------------------ # Compiler setup @@ -325,6 +331,22 @@ if (DFTRACER_BUILD_PYTHON_BINDINGS) . 
${CMAKE_BINARY_DIR}/symlink.sh \")") endif() +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_compact.sh ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact COPYONLY) +install( + PROGRAMS # shell script: PROGRAMS keeps the execute bit, FILES would install it non-executable + ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact + DESTINATION + bin +) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/merge_pfw.sh ${EXECUTABLE_OUTPUT_PATH}/merge_pfw COPYONLY) +install( + PROGRAMS # shell script: PROGRAMS keeps the execute bit, FILES would install it non-executable + ${EXECUTABLE_OUTPUT_PATH}/merge_pfw + DESTINATION + bin +) + #cmake_policy(SET CMP0079 NEW) # In case that we need more control over the target building order if(DFTRACER_ENABLE_TESTS) @@ -346,21 +368,28 @@ endif() #----------------------------------------------------------------------------- # Configure the config.cmake file for the build directory #----------------------------------------------------------------------------- -configure_file( +include(CMakePackageConfigHelpers) +configure_package_config_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/configure_files/${PROJECT_NAME}-config.cmake.build.in - ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/cmake/${PROJECT_NAME}/${PROJECT_NAME}-config.cmake @ONLY + "${CMAKE_BINARY_DIR}/${PROJECT_NAME}-config.cmake" + INSTALL_DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/cmake/${PROJECT_NAME}/${PROJECT_NAME}-config.cmake + PATH_VARS CMAKE_BINARY_DIR ) -configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/configure_files/${PROJECT_NAME}-config.cmake.install.in - ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/cmake/${PROJECT_NAME}/install/${PROJECT_NAME}-config.cmake @ONLY +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/configure_files/${PROJECT_NAME}-config.cmake.install.in + "${CMAKE_BINARY_DIR}/install/${PROJECT_NAME}-config.cmake" + INSTALL_DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/cmake/${PROJECT_NAME}/install/${PROJECT_NAME}-config.cmake + PATH_VARS CMAKE_BINARY_DIR ) install( FILES - ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/cmake/${PROJECT_NAME}/install/${PROJECT_NAME}-config.cmake + ${CMAKE_BINARY_DIR}/install/${PROJECT_NAME}-config.cmake DESTINATION
${DFTRACER_LIBDIR}/cmake/${PROJECT_NAME} ) +install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/dftracer-utils.cmake" + DESTINATION "${DFTRACER_LIBDIR}/cmake/dftracer") #----------------------------------------------------------------------------- # Configure the ${PROJECT_NAME}-config-version .cmake file for the install directory #----------------------------------------------------------------------------- diff --git a/README.md b/README.md index def0a23b..db0d6104 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,13 @@ [![Coverage Status](https://coveralls.io/repos/github/hariharan-devarajan/dftracer/badge.svg?branch=feature/apis)](https://coveralls.io/github/hariharan-devarajan/dftracer?branch=dev) [![Documentation Status](https://readthedocs.org/projects/dftracer/badge/?version=latest)](https://dftracer.readthedocs.io/en/latest/?badge=latest) -# DFTracer v1.0.2 +# DFTracer v1.0.3 A multi-level profiler for capturing application functions and low-level system I/O calls from deep learning workloads. Requirements for profiler 1. Python > 3.7 2. pybind11 - Requirements for analyzer 1. bokeh>=2.4.2 2. pybind11 @@ -23,20 +22,15 @@ Requirements for analyzer 10. python-intervals>=1.10.0.post1 11. matplotlib>=3.7.3 -## Build DFTracer with pip +## Installation -Users can easily install DFTracer using pip. This is the way most python packages are installed. -This method would work for both native python environments and conda environments. +Users can easily install DFTracer using pip. This is the way most Python packages are installed. +This method would work for both native Python environments and Conda environments. -### From source +### From PyPI ```bash - git clone git@github.com:hariharan-devarajan/dftracer.git - cd dftracer - # You can skip this for installing the dev branch. - # for latest stable version use master branch. - git checkout tags/ -b - pip install . 
+pip install pydftracer ``` ### From Github @@ -46,70 +40,77 @@ DFT_VERSION=dev pip install git+https://github.com/hariharan-devarajan/dftracer.git@${DFT_VERSION} ``` -For more build instructions check [here](https://dftracer.readthedocs.io/en/latest/build.html) - -Usage +### From source +```bash +git clone git@github.com:hariharan-devarajan/dftracer.git +cd dftracer +# You can skip this for installing the dev branch. +# for latest stable version use master branch. +git checkout tags/ -b +pip install . ``` - from dftracer.logger import dftracer, dft_fn - log_inst = dftracer.initialize_log(logfile=None, data_dir=None, process_id=-1) - dft_fn = dft_fn("COMPUTE") - - # Example of using function decorators - @dft_fn.log - def log_events(index): - sleep(1) - - # Example of function spawning and implicit I/O calls - def posix_calls(val): - index, is_spawn = val - path = f"{cwd}/data/demofile{index}.txt" - f = open(path, "w+") - f.write("Now the file has more content!") - f.close() - if is_spawn: - print(f"Calling spawn on {index} with pid {os.getpid()}") - log_inst.finalize() # This need to be called to correctly finalize DFTracer. - else: - print(f"Not calling spawn on {index} with pid {os.getpid()}") - - # NPZ calls internally calls POSIX calls. - def npz_calls(index): - # print(f"{cwd}/data/demofile2.npz") - path = f"{cwd}/data/demofile{index}.npz" - if os.path.exists(path): - os.remove(path) - records = np.random.randint(255, size=(8, 8, 1024), dtype=np.uint8) - record_labels = [0] * 1024 - np.savez(path, x=records, y=record_labels) - - def main(): - log_events(0) - npz_calls(1) - with get_context('spawn').Pool(1, initializer=init) as pool: - pool.map(posix_calls, ((2, True),)) - log_inst.finalize() - - - if __name__ == "__main__": - main() +For more build instructions check [here](https://dftracer.readthedocs.io/en/latest/build.html). 
+ +## Usage + +```python +from dftracer.logger import dftracer, dft_fn +log_inst = dftracer.initialize_log(logfile=None, data_dir=None, process_id=-1) +dft_fn = dft_fn("COMPUTE") + +# Example of using function decorators +@dft_fn.log +def log_events(index): + sleep(1) + +# Example of function spawning and implicit I/O calls +def posix_calls(val): + index, is_spawn = val + path = f"{cwd}/data/demofile{index}.txt" + f = open(path, "w+") + f.write("Now the file has more content!") + f.close() + if is_spawn: + print(f"Calling spawn on {index} with pid {os.getpid()}") + log_inst.finalize() # This needs to be called to correctly finalize DFTracer. + else: + print(f"Not calling spawn on {index} with pid {os.getpid()}") + +# NPZ calls internally invoke POSIX calls. +def npz_calls(index): + # print(f"{cwd}/data/demofile2.npz") + path = f"{cwd}/data/demofile{index}.npz" + if os.path.exists(path): + os.remove(path) + records = np.random.randint(255, size=(8, 8, 1024), dtype=np.uint8) + record_labels = [0] * 1024 + np.savez(path, x=records, y=record_labels) + +def main(): + log_events(0) + npz_calls(1) + with get_context('spawn').Pool(1, initializer=init) as pool: + pool.map(posix_calls, ((2, True),)) + log_inst.finalize() + +if __name__ == "__main__": + main() ``` -For this example, as the DFTRACER_CPP_INIT do not pass log file or data dir, we need to set ``DFTRACER_LOG_FILE`` and ``DFTRACER_DATA_DIR``. -By default the DFTracer mode is set to FUNCTION. +For this example, as the `dftracer.initialize_log` call does not pass `logfile` or `data_dir`, we need to set `DFTRACER_LOG_FILE` and `DFTRACER_DATA_DIR`. +By default, the DFTracer mode is set to `FUNCTION`. Example of running this configurations are: -``` - - # the process id, app_name and .pfw will be appended by the profiler for each app and process.
- # name of final log file is ~/log_file--.pfw - DFTRACER_LOG_FILE=~/log_file - # Colon separated paths for including for profiler - DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset:$PWD/data - # Enable profiler - DFTRACER_ENABLE=1 +```bash +# The process id, app_name and .pfw will be appended by the profiler for each app and process. +# The name of the final log file is ~/log_file--.pfw +DFTRACER_LOG_FILE=~/log_file +# Colon-separated paths to include in profiling +DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset:$PWD/data +# Enable profiler +DFTRACER_ENABLE=1 ``` For more example check [Examples](https://dftracer.readthedocs.io/en/latest/examples.html). - diff --git a/cmake/configure_files/dftracer-config.cmake.build.in b/cmake/configure_files/dftracer-config.cmake.build.in index 043fbc95..d6911931 100644 --- a/cmake/configure_files/dftracer-config.cmake.build.in +++ b/cmake/configure_files/dftracer-config.cmake.build.in @@ -1,8 +1,7 @@ -# This will create IMPORTED targets for DFTRACER. The executables will be -# dftracer::-bin (e.g., dftracer::dftracer-bin) and the library will -# be dftracer::dftracer. +# This will create IMPORTED targets for dftracer. +# The library target will be dftracer.
-include("${CMAKE_CURRENT_LIST_DIR}/DFTRACERConfigVersion.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/dftracer-config-version.cmake") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") @@ -10,7 +9,7 @@ list(APPEND CMAKE_MODULE_PATH "@EXTRA_CMAKE_MODULE_DIR@") #include(GNUInstallDirs) include(ExternalProject) -include(DFTRACERCMakeUtilities) +include(dftracer-utils) include(CMakePackageConfigHelpers) @@ -52,10 +51,30 @@ foreach (_DIR ${_TMP_LIBRARY_DIRS}) list(APPEND DFTRACER_LIBRARY_DIRS "${_LIBRARY_DIR}") endforeach (_DIR ${_TMP_LIBRARY_DIRS}) -if (NOT TARGET dftracer::dftracer) - include(${CMAKE_CURRENT_LIST_DIR}/DFTRACERTargets.cmake) -endif (NOT TARGET dftracer::dftracer) - -check_required_components(DFTRACER) - -set(DFTRACER_LIBRARIES dftracer::dftracer) \ No newline at end of file +if (NOT TARGET dftracer) + include(${CMAKE_CURRENT_LIST_DIR}/dftracer-targets.cmake) +endif (NOT TARGET dftracer) + + +find_package(brahma REQUIRED) +if (${brahma_FOUND}) + message(STATUS "[DFTRACER] found brahma at ${BRAHMA_INCLUDE_DIRS}") + include_directories(${BRAHMA_INCLUDE_DIRS}) + target_link_libraries(dftracer INTERFACE ${BRAHMA_LIBRARIES}) +else () + message(FATAL_ERROR "-- [DFTRACER] brahma is needed for ${PROJECT_NAME} build") +endif () + +find_package(yaml-cpp REQUIRED) +if (${yaml-cpp_FOUND}) + message(STATUS "[DFTRACER] found yaml-cpp at ${YAML_CPP_INCLUDE_DIR}") + include_directories(${YAML_CPP_INCLUDE_DIR}) + set(YAML_CPP_LIBRARY_DIR "${YAML_CPP_CMAKE_DIR}/../../") + target_link_libraries(dftracer INTERFACE -L${YAML_CPP_LIBRARY_DIR} ${YAML_CPP_LIBRARIES}) +else () + message(FATAL_ERROR "-- [DFTRACER] yaml-cpp is needed for ${PROJECT_NAME} build") +endif () + +check_required_components(dftracer) + +set(DFTRACER_LIBRARIES dftracer) \ No newline at end of file diff --git a/cmake/configure_files/dftracer-config.cmake.install.in b/cmake/configure_files/dftracer-config.cmake.install.in index 
8fb0a4dd..e5acd910 100644 --- a/cmake/configure_files/dftracer-config.cmake.install.in +++ b/cmake/configure_files/dftracer-config.cmake.install.in @@ -1,8 +1,7 @@ -# This will create IMPORTED targets for DFTRACER. The executables will be -# DFTRACER::-bin (e.g., DFTRACER::dftracer-bin) and the library will -# be DFTRACER::dftracer. +# This will create IMPORTED targets for dftracer. +# The library target will be dftracer. -include("${CMAKE_CURRENT_LIST_DIR}/DFTRACERConfigVersion.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/dftracer-config-version.cmake") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") @@ -10,7 +9,7 @@ list(APPEND CMAKE_MODULE_PATH "@EXTRA_CMAKE_MODULE_DIR@") #include(GNUInstallDirs) include(ExternalProject) -include(DFTRACERCMakeUtilities) +include(dftracer-utils) include(CMakePackageConfigHelpers) @@ -40,22 +39,41 @@ set(DFTRACER_HAS_STD_FSTREAM_FD @DFTRACER_HAS_STD_FSTREAM_FD@) @PACKAGE_INIT@ # Now actually import the DFTRACER target -set(_TMP_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIRS@") +set(_TMP_INCLUDE_DIRS "@DFTRACER_INSTALL_INCLUDE_DIR@") foreach (_DIR ${_TMP_INCLUDE_DIRS}) set_and_check(_INCLUDE_DIR "${_DIR}") list(APPEND DFTRACER_INCLUDE_DIRS "${_INCLUDE_DIR}") endforeach (_DIR "${_TMP_INCLUDE_DIRS}") -set(_TMP_LIBRARY_DIRS "@PACKAGE_LIB_INSTALL_DIR@") +set(_TMP_LIBRARY_DIRS "@DFTRACER_INSTALL_LIB_DIR@") foreach (_DIR ${_TMP_LIBRARY_DIRS}) set_and_check(_LIBRARY_DIR "${_DIR}") list(APPEND DFTRACER_LIBRARY_DIRS "${_LIBRARY_DIR}") endforeach (_DIR ${_TMP_LIBRARY_DIRS}) -if (NOT TARGET DFTRACER::dftracer) - include(${CMAKE_CURRENT_LIST_DIR}/DFTRACERTargets.cmake) -endif (NOT TARGET DFTRACER::dftracer) - -check_required_components(DFTRACER) - -set(DFTRACER_LIBRARIES DFTRACER::dftracer) \ No newline at end of file +if (NOT TARGET dftracer) + include(${CMAKE_CURRENT_LIST_DIR}/dftracer-targets.cmake) +endif (NOT TARGET dftracer) + +find_package(brahma
REQUIRED) +if (${brahma_FOUND}) + message(STATUS "[DFTRACER] found brahma at ${BRAHMA_INCLUDE_DIRS}") + include_directories(${BRAHMA_INCLUDE_DIRS}) + target_link_libraries(dftracer INTERFACE ${BRAHMA_LIBRARIES}) +else () + message(FATAL_ERROR "-- [DFTRACER] brahma is needed for ${PROJECT_NAME} build") +endif () + +find_package(yaml-cpp REQUIRED) +if (${yaml-cpp_FOUND}) + message(STATUS "[DFTRACER] found yaml-cpp at ${YAML_CPP_INCLUDE_DIR}") + include_directories(${YAML_CPP_INCLUDE_DIR}) + set(YAML_CPP_LIBRARY_DIR "${YAML_CPP_CMAKE_DIR}/../../") + target_link_libraries(dftracer INTERFACE -L${YAML_CPP_LIBRARY_DIR} ${YAML_CPP_LIBRARIES}) +else () + message(FATAL_ERROR "-- [DFTRACER] yaml-cpp is needed for ${PROJECT_NAME} build") +endif () + +check_required_components(dftracer) + +set(DFTRACER_LIBRARIES dftracer) \ No newline at end of file diff --git a/cmake/configure_files/dftracer_config.hpp.in b/cmake/configure_files/dftracer_config.hpp.in index e3179997..acbfd186 100644 --- a/cmake/configure_files/dftracer_config.hpp.in +++ b/cmake/configure_files/dftracer_config.hpp.in @@ -5,6 +5,12 @@ #define DFTRACER_PACKAGE_VERSION @DFTRACER_PACKAGE_VERSION@ #cmakedefine DFTRACER_GIT_VERSION @DFTRACER_GIT_VERSION@ +#define DFTRACER_GET_VERSION(MAJOR, MINOR, PATCH) (MAJOR * 100000 + MINOR * 100 + PATCH) +#define DFTRACER_VERSION (DFTRACER_GET_VERSION @DFTRACER_VERSION@) +#define DFTRACER_VERSION_MAJOR (DFTRACER_VERSION / 100000) +#define DFTRACER_VERSION_MINOR ((DFTRACER_VERSION / 100) % 1000) +#define DFTRACER_VERSION_PATCH (DFTRACER_VERSION % 100) + /* Compiler used */ #cmakedefine CMAKE_BUILD_TYPE "@CMAKE_BUILD_TYPE@" diff --git a/dfanalyzer/dask/conf/corona.yaml b/dfanalyzer/dask/conf/corona.yaml index 19be2209..80e97829 100644 --- a/dfanalyzer/dask/conf/corona.yaml +++ b/dfanalyzer/dask/conf/corona.yaml @@ -3,6 +3,9 @@ config: conf_dir: ${DFTRACER_APP}/dfanalyzer/dask/conf run_dir: ${DFTRACER_APP}/dfanalyzer/dask/run_dir log_dir: ${DFTRACER_APP}/dfanalyzer/dask/logs +dask: 
+ scheduler: dask scheduler + worker: dask worker job: num_nodes: 1 wall_time_min: 60 diff --git a/dfanalyzer/dask/conf/polaris.yaml b/dfanalyzer/dask/conf/polaris.yaml new file mode 100644 index 00000000..8dbbd9f8 --- /dev/null +++ b/dfanalyzer/dask/conf/polaris.yaml @@ -0,0 +1,26 @@ +config: + script_dir: ${DFTRACER_APP}/dfanalyzer/dask/scripts + conf_dir: ${DFTRACER_APP}/dfanalyzer/dask/conf + run_dir: ${DFTRACER_APP}/dfanalyzer/dask/run_dir + log_dir: ${DFTRACER_APP}/dfanalyzer/dask/logs +dask: + scheduler: dask-scheduler + worker: dask-worker +job: + num_nodes: 1 + wall_time_min: 01:00:00 + env_id: PBS_JOBID + queue: debug +scheduler: + cmd: qsub -l select=${DFTRACER_JOB_NUM_NODES} -l walltime=${DFTRACER_JOB_WALL_TIME_MIN} -l filesystems=home:eagle:grand -q ${DFTRACER_JOB_QUEUE} -A ${DFTRACER_ACCOUNT} -- + port: 11000 + kill: qdel +worker: + total_tasks: 16 + ppn: 16 + cmd: /opt/cray/pals/1.3.4/bin/mpiexec -n ${DFTRACER_WORKER_TOTAL_TASKS} --ppn ${DFTRACER_WORKER_PPN} + per_core: 1 + threads: 1 + local_dir: /dev/shm/$USER/dask-workspace + kill: qdel + connection_string: tcp://${DFTRACER_SCHEDULER_HOSTNAME}:${DFTRACER_SCHEDULER_PORT} diff --git a/dfanalyzer/dask/conf/quartz.yaml b/dfanalyzer/dask/conf/quartz.yaml new file mode 100644 index 00000000..a79dc1b4 --- /dev/null +++ b/dfanalyzer/dask/conf/quartz.yaml @@ -0,0 +1,23 @@ +config: + script_dir: ${DFTRACER_APP}/dfanalyzer/dask/scripts + conf_dir: ${DFTRACER_APP}/dfanalyzer/dask/conf + run_dir: ${DFTRACER_APP}/dfanalyzer/dask/run_dir + log_dir: ${DFTRACER_APP}/dfanalyzer/dask/logs +dask: + scheduler: dask scheduler + worker: dask worker +job: + num_nodes: 4 + wall_time_min: 60 + env_id: SLURM_JOB_ID +scheduler: + cmd: srun -N ${DFTRACER_JOB_NUM_NODES} -t ${DFTRACER_JOB_WALL_TIME_MIN} + port: 12005 + kill: scancel ${SLURM_JOB_ID} +worker: + ppn: 16 + cmd: srun -N ${DFTRACER_JOB_NUM_NODES} --ntasks-per-node=${DFTRACER_WORKER_PPN} + per_core: 1 + threads: 1 + local_dir: /dev/shm/dask-workspace + kill: scancel 
${SLURM_JOB_ID} \ No newline at end of file diff --git a/dfanalyzer/dask/conf/ruby.yaml b/dfanalyzer/dask/conf/ruby.yaml index 1b1752f3..f3232b1f 100644 --- a/dfanalyzer/dask/conf/ruby.yaml +++ b/dfanalyzer/dask/conf/ruby.yaml @@ -3,6 +3,9 @@ config: conf_dir: ${DFTRACER_APP}/dfanalyzer/dask/conf run_dir: ${DFTRACER_APP}/dfanalyzer/dask/run_dir log_dir: ${DFTRACER_APP}/dfanalyzer/dask/logs +dask: + scheduler: dask scheduler + worker: dask worker job: num_nodes: 1 wall_time_min: 60 diff --git a/dfanalyzer/dask/scripts/start_dask_distributed.sh b/dfanalyzer/dask/scripts/start_dask_distributed.sh index 6072323f..73a35531 100755 --- a/dfanalyzer/dask/scripts/start_dask_distributed.sh +++ b/dfanalyzer/dask/scripts/start_dask_distributed.sh @@ -12,6 +12,12 @@ case $hostname in *"ruby"*) DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/ruby.yaml ;; + "quartz"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/quartz.yaml + ;; + "polaris"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/polaris.yaml + ;; esac if [[ "$DFTRACER_DASK_CONF_NAME" == "UNSET" ]]; then @@ -27,9 +33,33 @@ source ${DFTRACER_APP}/dfanalyzer/dask/scripts/utils.sh eval $(parse_yaml $DFTRACER_DASK_CONF_NAME DFTRACER_) source ${DFTRACER_ENV}/bin/activate +mkdir -p ${DFTRACER_CONFIG_LOG_DIR} +mkdir -p ${DFTRACER_CONFIG_RUN_DIR} + +rm -rf ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json -dask scheduler --scheduler-file ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json --port ${DFTRACER_SCHEDULER_PORT} > ${DFTRACER_CONFIG_LOG_DIR}/scheduler_${USER}.log 2>&1 & +${DFTRACER_DASK_SCHEDULER} --scheduler-file ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json --port ${DFTRACER_SCHEDULER_PORT} > ${DFTRACER_CONFIG_LOG_DIR}/scheduler_${USER}.log 2>&1 & scheduler_pid=$! 
echo $scheduler_pid > ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.pid -${DFTRACER_SCHEDULER_CMD} ${DFTRACER_CONFIG_SCRIPT_DIR}/start_dask_worker.sh ${DFTRACER_DASK_CONF_NAME} +file=${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json +timeout=30 # seconds to wait for timeout +SECONDS=0 # initialize bash's builtin counter + +until [ -s "$file" ] || (( SECONDS >= timeout )); do sleep 1; done + + +if test -f $file; +then + echo "Scheduler with $scheduler_pid is running" + # Do something knowing the pid exists, i.e. the process with $PID is running +else + echo "Scheduler with $scheduler_pid failed. Check the ${DFTRACER_CONFIG_LOG_DIR}/scheduler_${USER}.log file" + exit 1 +fi + +rm ${DFTRACER_CONFIG_RUN_DIR}/job_id_${USER}.pid + +${DFTRACER_SCHEDULER_CMD} ${DFTRACER_CONFIG_SCRIPT_DIR}/start_dask_worker.sh ${DFTRACER_DASK_CONF_NAME} ${hostname} + + diff --git a/dfanalyzer/dask/scripts/start_dask_worker.sh b/dfanalyzer/dask/scripts/start_dask_worker.sh index 6d3b689c..12c08299 100755 --- a/dfanalyzer/dask/scripts/start_dask_worker.sh +++ b/dfanalyzer/dask/scripts/start_dask_worker.sh @@ -1,6 +1,9 @@ #!/bin/bash +set -x + DFTRACER_DASK_CONF_NAME=$1 +DFTRACER_SCHEDULER_HOSTNAME=$2 source $HOME/.dftracer/configuration.sh export PYTHONPATH=${DFTRACER_APP}:${PYTHONPATH} @@ -9,13 +12,24 @@ export PYTHONPATH=${DFTRACER_APP}:${PYTHONPATH} source ${DFTRACER_APP}/dfanalyzer/dask/scripts/utils.sh eval $(parse_yaml $DFTRACER_DASK_CONF_NAME DFTRACER_) DFTRACER_JOB_ID=${!DFTRACER_JOB_ENV_ID} + +echo -n $DFTRACER_JOB_ID > ${DFTRACER_CONFIG_RUN_DIR}/job_id_${USER}.pid + source ${DFTRACER_ENV}/bin/activate echo "Activated Env" + +if [ "x${DFTRACER_WORKER_CONNECTION_STRING}" != "x" ]; then + dask_scheduler_conn=${DFTRACER_WORKER_CONNECTION_STRING} +else + dask_scheduler_conn="--scheduler-file ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json" +fi + while : do -${DFTRACER_WORKER_CMD} dask worker --scheduler-file ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.json \ +${DFTRACER_WORKER_CMD} 
${DFTRACER_DASK_WORKER} ${dask_scheduler_conn} \ --local-directory ${DFTRACER_WORKER_LOCAL_DIR} \ --nworkers ${DFTRACER_WORKER_PER_CORE} --nthreads ${DFTRACER_WORKER_THREADS} > ${DFTRACER_CONFIG_LOG_DIR}/worker_${DFTRACER_JOB_ID}.log 2>&1 echo "Workers existed. Restarting in 1 second" sleep 1 done + diff --git a/dfanalyzer/dask/scripts/stop_dask_distributed.sh b/dfanalyzer/dask/scripts/stop_dask_distributed.sh index b72d207b..91932f08 100755 --- a/dfanalyzer/dask/scripts/stop_dask_distributed.sh +++ b/dfanalyzer/dask/scripts/stop_dask_distributed.sh @@ -3,11 +3,35 @@ set -x source $HOME/.dftracer/configuration.sh export PYTHONPATH=${DFTRACER_APP}:${PYTHONPATH} # This can be set using env variable or arguments to script. -DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/ruby.yaml + +hostname=`hostname` +DFTRACER_DASK_CONF_NAME="UNSET" + +case $hostname in + *"corona"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/corona.yaml + ;; + *"ruby"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/ruby.yaml + ;; + "quartz"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/quartz.yaml + ;; + "polaris"*) + DFTRACER_DASK_CONF_NAME=${DFTRACER_APP}/dfanalyzer/dask/conf/polaris.yaml + ;; +esac + +if [[ "$DFTRACER_DASK_CONF_NAME" == "UNSET" ]]; then + echo "UNSUPPORTED $hostname" + exit 1 +fi # This is start of every script. 
source ${DFTRACER_APP}/dfanalyzer/dask/scripts/utils.sh eval $(parse_yaml $DFTRACER_DASK_CONF_NAME DFTRACER_) -$DFTRACER_SCHEDULER_KILL -$DFTRACER_WORKER_KILL +kill -9 `cat ${DFTRACER_CONFIG_RUN_DIR}/scheduler_${USER}.pid` +export DFTRACER_JOB_ID=`cat ${DFTRACER_CONFIG_RUN_DIR}/job_id_${USER}.pid` +$DFTRACER_SCHEDULER_KILL $DFTRACER_JOB_ID +$DFTRACER_WORKER_KILL $DFTRACER_JOB_ID diff --git a/dfanalyzer/graph.py b/dfanalyzer/graph.py new file mode 100644 index 00000000..4c59948e --- /dev/null +++ b/dfanalyzer/graph.py @@ -0,0 +1,211 @@ + +import dask.dataframe as dd +import re +import os +import glob +import pandas as pd + + +class DFGrepInterference: + """ + This class provides methods to manage the graph based representation of IO traces for interference computation. + init parameters: + ddf (dask dataframe): dask dataframe (analyzer.events) for computing the metrices. + app_name (str): Application identifier. Used for reading/writing the checkpoint files. + cp_dir (str): Checkpoint directory path + existing (bool): If true, computation can be avoided and the instance can be loaded from checkpoint files for downstream analysis + """ + + def __init__(self, ddf=None, app_name="", cp_dir="", existing=False): + self.ddf = ddf + self.ddf_deg = None + self.inter = None + self.correlation = None + self.app_name = app_name + self.cp_dir = cp_dir + self.meta = { + 'id': str, + 'name': str, + 'pid': int, + 'size': int, + 'ts': int, + 'te': int, + 'mount_point': str, + 'dur': int, + 'trange': int, + 'deg': int + } + if existing: + self.read_checkpoint(id="inter", cp_dir=cp_dir) + self.read_checkpoint(id='deg_ddf', cp_dir=cp_dir) + else: + self.ddf = self.select_data_cols( + cols=['id', 'name', 'pid', 'size', 'ts', 'te', 'mount_point', 'dur', 'trange']) + + def select_data_cols(self, cols=[]): + ''' + returns the data with size > 0 and with selected columns + ''' + return self.ddf.query('size > 0')[cols] + + def get_degree(self): + """ + Calculates the degree for each event. 
The degree calculation is done on group of events with same mount point and within same timerange. + + get_deg calculates the degree for each group. It first sort the events according to start time, and for each events, look forward to determine if any events have overlapping time. If so, increase the degree by 1 for both events. + """ + + def get_deg(df_group): + df_group = df_group.sort_values(by='ts').reset_index( + drop=True) # check drop true + group_length = len(df_group) + degrees = [1]*group_length + # update degrees by looping throught the group + for row in range(group_length): + current_stop_time = df_group.at[row, 'te'] + # look for neighbors + for neigh in range(row+1, group_length): + neigh_start_time = df_group.at[neigh, 'ts'] + if (neigh_start_time < current_stop_time): + degrees[row] += 1 + degrees[neigh] += 1 + else: + break + # Add a new column 'deg' representing the count of overlaps + df_group['deg'] = degrees + return df_group + + self.ddf_deg = self.ddf.groupby(['mount_point', 'trange']).apply(get_deg, meta=self.meta).reset_index( + drop=True) # multiple trange column error while writing so had to drop + # self.ddf_deg.set_index(['id']) + + def get_interference(self): + ''' + calculate the interference factor for each events. + step1: calculate the duration of minimum degree for all size and mount point combination. + step2: calculate the Interference factor based on duration of the event and the duration of min degree event. + ''' + def dur_of_min_deg(ddf): + ''' + calculate the duration of minimum degree for all size and mount point combination. 
class DFGrepWorkflow:
    """
    Represent I/O traces as producer/consumer workflow graphs.

    Parameters
    ----------
    ddf : dataframe, optional
        Event dataframe. ``create_workflow`` groups it by ``filename`` and
        ``pid`` and calls ``.compute()`` on an intermediate result, so it
        expects a Dask-style dataframe with ``filename``, ``pid``, ``prod``,
        ``cons`` and ``ts`` columns.
    app_name : str, optional
        Stored on the instance; not read by the methods of this class.
    trace_path : str, optional
        Glob pattern of the trace files, used by ``get_pid_map``.
    """

    def __init__(self, ddf=None, app_name="", trace_path=""):
        self.ddf = ddf
        self.app_name = app_name
        self.trace_path = trace_path

    def select_cols(self, cols=None):
        """Return ``self.ddf`` restricted to ``cols`` (no columns when omitted)."""
        return self.ddf[[] if cols is None else cols]

    def get_pid_map(self):
        '''
        Map process ids to application names based on the trace file names
        (designed for mummi traces).

        Every file matching ``self.trace_path`` whose basename has more than
        four dot-separated components contributes ``{component[3]: component[1]}``.
        '''
        pid_map = {}
        for path in glob.glob(self.trace_path):
            parts = os.path.basename(path).split('.')
            if len(parts) > 4:
                pid_map[parts[3]] = parts[1]
        return pid_map

    def create_workflow(self):
        '''
        Select the events that belong to files which are both produced and consumed.

        1. Sum ``prod``/``cons`` per file and keep the files where both are > 0.
        2. Aggregate the events of those files per ``(filename, pid)``
           (sum of ``prod``/``cons``, earliest ``ts``).
        3. Attach the per-file totals (``*_fid``) to the per-pid values (``*_pid``)
           and drop the rows where a single pid accounts for all of a file's
           production and consumption.
        '''
        # A list (not a tuple) is required to select several columns from a groupby.
        prod_cons = self.ddf.groupby('filename')[['prod', 'cons']].sum()
        prod_cons = prod_cons.query('prod > 0 and cons > 0').reset_index()
        filelist = prod_cons.filename.unique().compute()
        selected_events = self.ddf[self.ddf.filename.isin(filelist)]
        selected_events_sum = selected_events.groupby(['filename', 'pid']).agg(
            {'prod': 'sum', 'cons': 'sum', 'ts': 'min'}).reset_index()
        merged = selected_events_sum.merge(
            prod_cons, on=["filename"], how='left', suffixes=('_pid', '_fid'))
        return merged.query(
            'not (prod_pid == prod_fid and cons_pid == cons_fid)')

    def create_graph_df(self, df, pid_map):
        '''
        Build the source/destination edge list used to plot the workflow graph
        (currently designed for the mummi workflow).

        ``df`` needs the columns ``filename``, ``pid``, ``prod_pid``, ``cons_pid``
        and ``ts``. File nodes are named ``f_<basename>`` with every digit run
        replaced by ``x``; process nodes are named ``p_<name>`` where the name
        comes from ``pid_map`` (keyed by ``str(pid)``) or is the pid itself.

        Returns a pandas DataFrame with columns ``src``, ``dest`` and ``wt``
        (``wt`` is the row's ``ts``); it is empty, but has those columns, when
        no edge is produced.
        '''
        def process_row(row):
            filename = "f_" + os.path.basename(re.sub(r"\d+", "x", row['filename']))
            pid_key = str(row['pid'])
            pid = "p_" + pid_map.get(pid_key, pid_key)
            prod = row['prod_pid']
            cons = row['cons_pid']
            file_to_pid = {'src': filename, 'dest': pid, 'wt': row['ts']}
            pid_to_file = {'src': pid, 'dest': filename, 'wt': row['ts']}

            if prod == 0:
                # never produced by this pid: the pid only reads the file
                return [file_to_pid]
            if cons == 0:
                # never consumed by this pid: the pid only writes the file
                return [pid_to_file]
            if prod > 0 and cons > 0:
                return [file_to_pid, pid_to_file]
            # e.g. NaN counts: contribute no edge instead of returning None
            return []

        edges = []
        # Iterate over records rather than df.apply(axis=1): for an empty frame
        # apply() returns a DataFrame whose iteration yields column names.
        for record in df.to_dict('records'):
            edges.extend(process_row(record))
        return pd.DataFrame(edges, columns=['src', 'dest', 'wt'])
class CytoGraph:
    """Convert networkx graphs to ipycytoscape JSON and build notebook widgets."""

    def get_json(self, graph, degree_dict=None):
        '''
        Input: networkx graph, dict (key = node_id and value = degree).
               When the dict is omitted or empty, degrees are read from
               ``graph.degree``.
        Returns: json_data (format needed for ipycytoscape visualization)
        '''
        if not degree_dict:
            degree_dict = dict(graph.degree)
        # for pid_tid -> file bipartite graphs
        return {
            'nodes': [
                {'data': {'id': str(node), 'degree': int(degree_dict[node])}}
                for node, _attrs in graph.nodes(data=True)
            ],
            'edges': [
                {'data': {'source': str(u), 'target': str(v), 'directed': True}}
                for u, v, _attrs in graph.edges(data=True)
            ],
        }

    def get_rich_json(self, graph, degree_dict=None):
        '''
        Get json with node and edge attributes for a directed graph.

        Input: networkx directed graph whose edges carry a ``wt`` attribute
               (``degree_dict`` is accepted for signature compatibility and unused).
        Returns: json_data (format needed for ipycytoscape visualization).
                 Nodes carry ``id``, ``label`` and ``outdegree``; edges carry
                 ``source``, ``target`` and ``wt``, plus the class
                 ``bidirectional`` when the reverse edge also exists.
        '''
        nodes = [
            {'data': {'id': str(node), 'label': str(node),
                      'outdegree': int(graph.out_degree(node))}}
            for node in graph.nodes()
        ]
        edges = []
        for u, v, attrs in graph.edges(data=True):
            edge_data = {'data': {'source': str(u), 'target': str(v), 'wt': int(attrs['wt'])}}
            if graph.has_edge(v, u):
                edge_data['classes'] = 'bidirectional'
            edges.append(edge_data)
        return {'nodes': nodes, 'edges': edges}

    def get_style(self, style_name):
        '''
        Load the style list called ``style_name`` from ``cystyles.json`` located
        next to this module; returns ``[]`` for an unknown name.
        '''
        current_dir = os.path.dirname(os.path.abspath(__file__))
        styles_file = os.path.join(current_dir, 'cystyles.json')
        with open(styles_file, 'r') as f:
            styles = json.load(f)
        return styles.get(style_name, [])

    def temporal_view(self, df):
        '''
        Create an ipywidget visualization showing the current and the next
        cumulative graph side by side, using trange as timestamp.

        ``df`` is an edge list with columns ``src``, ``dest``, ``wt`` and
        ``trange``; view *k* contains every edge whose ``trange`` is <= the
        k-th distinct trange value. Raises ValueError when ``df`` has no
        trange values.
        '''
        def get_graphs(df, each):
            g_data = df.query(f'trange <= {each}')
            G = nx.from_pandas_edgelist(g_data, source='src', target='dest',
                                        edge_attr='wt', create_using=nx.DiGraph())
            return self.get_rich_json(G)

        views_t = sorted(df.trange.unique())
        if not views_t:
            raise ValueError("temporal_view requires at least one 'trange' value")
        views = [get_graphs(df, each) for each in views_t]

        # Read the style file once instead of on every redraw.
        style = self.get_style('direct')

        def render(widget, view):
            widget.graph.clear()
            widget.graph.add_graph_from_json(view)
            widget.set_layout(name='dagre')
            widget.set_style(style)

        # Index to keep track of current view
        current_view = 0

        # Initialize ipycytoscape graph widgets with the first views; the
        # modulo keeps this valid when there is only a single view.
        graph1 = ipycytoscape.CytoscapeWidget()
        render(graph1, views[current_view])
        graph2 = ipycytoscape.CytoscapeWidget()
        render(graph2, views[(current_view + 1) % len(views)])

        # Navigation buttons
        button_prev = widgets.Button(description="Prev")
        button_next = widgets.Button(description="Next")
        output = widgets.Output()

        def update_graph_views():
            render(graph1, views[current_view])
            render(graph2, views[(current_view + 1) % len(views)])
            with output:
                output.clear_output()
                print(f"View {current_view + 1} / {len(views)}")

        def on_prev_clicked(b):
            nonlocal current_view
            current_view = (current_view - 1) % len(views)
            update_graph_views()

        def on_next_clicked(b):
            nonlocal current_view
            current_view = (current_view + 1) % len(views)
            update_graph_views()

        button_prev.on_click(on_prev_clicked)
        button_next.on_click(on_next_clicked)

        # Display widgets side by side
        buttons_box = widgets.HBox([button_prev, button_next])
        graphs_box = widgets.HBox([graph1, graph2])
        display(widgets.VBox([buttons_box, graphs_box, output]))


class GraphFunctions:
    """Build and visualize overlap graphs of events."""

    def create_nx_graph(self, events):
        '''
        Input: dataframe to create graph from (needs ``ts``, ``te`` and, after
               ``reset_index()``, an ``id`` column).
        Returns: networkx graph with one node per row of the dataframe; an edge
                 between two nodes exists if their time ranges overlap
                 (the later event starts no later than the earlier one stops).
        '''
        graph = nx.Graph()
        # reset_index() without drop: an index named 'id' becomes the 'id' column.
        df = events.sort_values('ts').reset_index()
        ids = df['id'].tolist()
        starts = df['ts'].tolist()
        stops = df['te'].tolist()
        # Register every event so rows without any overlap are kept as isolated nodes.
        graph.add_nodes_from(ids)
        event_size = len(ids)
        for row in range(event_size):
            current_stop_time = stops[row]
            for neigh in range(row + 1, event_size):
                if starts[neigh] <= current_stop_time:
                    graph.add_edge(ids[row], ids[neigh])
                else:
                    # rows are sorted by start time: no later row can overlap
                    break
        return graph

    def visualize_graph(self, nx_graph, cyto_obj):
        '''
        Input: networkx graph and a CytoGraph-like object providing
               ``get_json`` and ``get_style``.
        Returns an ipycytoscape widget built from the graph's json data,
        using the ``default`` style and a left-to-right dagre layout.
        '''
        app_view = ipycytoscape.CytoscapeWidget()
        degree_dict = dict(nx_graph.degree)
        json_g = cyto_obj.get_json(nx_graph, degree_dict)
        app_view.graph.add_graph_from_json(json_g)
        app_view.set_layout(name="dagre", spacingFactor=1.5, rankDir="LR", fit=True)
        app_view.set_style(cyto_obj.get_style('default'))
        return app_view

    def visualize_nxgraph(self, nx_graph, cyto_obj):
        '''
        Returns an ipycytoscape widget built directly from the networkx graph
        (as a directed graph), using the ``directed`` style.
        '''
        app_view = ipycytoscape.CytoscapeWidget()
        app_view.graph.add_graph_from_networkx(nx_graph, directed=True)
        app_view.set_layout(name="dagre")
        app_view.set_style(cyto_obj.get_style('directed'))
        return app_view
-43,10 +43,10 @@ def __init__(self): self.log_file = "dfanalyzer.log" self.dask_scheduler = None self.index_dir = None - self.time_approximate = False + self.time_approximate = True self.slope_threshold = 45 - self.time_granularity = 1e3 - self.skip_hostname = False + self.time_granularity = 1e6 + self.skip_hostname = True self.conditions = None dft_configuration = DFTConfiguration() @@ -158,12 +158,13 @@ def load_indexed_gzip_files(filename, start, end): logging.debug(f"Read {len(json_lines)} json lines for [{start}, {end}]") return json_lines -def load_objects(line, fn, time_granularity, time_approximate, condition_fn): +def load_objects(line, fn, time_granularity, time_approximate, condition_fn, load_data): d = {} if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" : val = {} try: - val = json.loads(line) + unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line]) + val = json.loads(unicode_line) logging.debug(f"Loading dict {val}") if "name" in val: d["name"] = val["name"] @@ -180,7 +181,7 @@ def load_objects(line, fn, time_granularity, time_approximate, condition_fn): d["trange"] = int(((val["ts"] + val["dur"])/2.0) / time_granularity) d.update(io_function(val, d, time_approximate,condition_fn)) if fn: - d.update(fn(val, d, time_approximate,condition_fn)) + d.update(fn(val, d, time_approximate,condition_fn, load_data)) logging.debug(f"built an dictionary for line {d}") except ValueError as error: logging.error(f"Processing {line} failed with {error}") @@ -229,10 +230,22 @@ def io_function(json_object, current_dict, time_approximate,condition_fn): d["hostname"] = json_object["args"]["hostname"] if "POSIX" == json_object["cat"] and "ret" in json_object["args"]: - if "write" in json_object["name"]: + if json_object["name"] == "write": d["size"] = int(json_object["args"]["ret"]) - elif "read" in json_object["name"] and "readdir" not in json_object["name"]: + elif json_object["name"] == "read": d["size"] = 
int(json_object["args"]["ret"]) + elif json_object["name"] == "fwrite": + d["size"] = 1 + if "ret" in json_object["args"]: + d["size"] *= int(json_object["args"]["ret"]) + if "size" in json_object["args"]: + d["size"] *= int(json_object["args"]["size"]) + elif json_object["name"] == "fread": + d["size"] = 1 + if "ret" in json_object["args"]: + d["size"] *= int(json_object["args"]["ret"]) + if "size" in json_object["args"]: + d["size"] *= int(json_object["args"]["size"]) else: if "image_size" in json_object["args"]: d["size"] = int(json_object["args"]["image_size"]) @@ -340,8 +353,13 @@ def human_format_time(num): class DFAnalyzer: - def __init__(self, file_pattern, load_fn=None, load_cols={}): + def __init__(self, file_pattern, load_fn=None, load_cols={}, load_data = {}): + self.conf = get_dft_configuration() + if self.conf.dask_scheduler: + client = Client.current() + if len(load_data)>0: + client.scatter(load_data) file_pattern = glob(file_pattern) all_files = [] pfw_pattern = [] @@ -386,13 +404,15 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}): gz_bag = json_lines.map(load_objects, fn=load_fn, time_granularity=self.conf.time_granularity, time_approximate=self.conf.time_approximate, - condition_fn=self.conf.conditions).filter(lambda x: "name" in x) + condition_fn=self.conf.conditions, + load_data=load_data).filter(lambda x: "name" in x) main_bag = None if len(pfw_pattern) > 0: pfw_bag = dask.bag.read_text(pfw_pattern).map(load_objects, fn=load_fn, time_granularity=self.conf.time_granularity, time_approximate=self.conf.time_approximate, - condition_fn=self.conf.conditions).filter(lambda x: "name" in x) + condition_fn=self.conf.conditions, + load_data=load_data).filter(lambda x: "name" in x) if len(pfw_gz_pattern) > 0 and len(pfw_pattern) > 0: main_bag = dask.bag.concat([pfw_bag, gz_bag]) elif len(pfw_gz_pattern) > 0: @@ -411,10 +431,11 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}): logging.debug(f"Number of partitions used are 
{self.n_partition}") self.events = events.repartition(npartitions=self.n_partition).persist() _ = wait(self.events) - self.events['ts'] = self.events['ts'] - self.events['ts'].min() - self.events['te'] = self.events['ts'] + self.events['dur'] - self.events['trange'] = self.events['ts'] // self.conf.time_granularity + self.events['ts'] = (self.events['ts'] - self.events['ts'].min()).astype('uint64[pyarrow]') + self.events['te'] = (self.events['ts'] + self.events['dur']).astype('uint64[pyarrow]') + self.events['trange'] = (self.events['ts'] // self.conf.time_granularity).astype('uint16[pyarrow]') self.events = self.events.persist() + _ = wait(self.events) else: logging.error(f"Unable to load Traces") @@ -471,7 +492,7 @@ def _calculate_time(self): grouped_df[["only_app_io"]].apply(size_portion, col="only_app_io", axis=1).sum(), grouped_df[["only_app_compute"]].apply(size_portion, col="only_app_compute", axis=1).sum(), ) - logging.info(f"Approximate {self.conf.time_approximate} {total_time}, {total_io_time}, {total_compute_time}, {total_app_io_time}, \ + logging.debug(f"Approximate {self.conf.time_approximate} total_time:{total_time}, {total_io_time}, {total_compute_time}, {total_app_io_time}, \ {only_io}, {only_compute}, {only_app_io}, {only_app_compute}") return total_time, total_io_time, total_compute_time, total_app_io_time, \ only_io, only_compute, only_app_io, only_app_compute @@ -667,6 +688,7 @@ def setup_dask_cluster(): client = Client(cluster) # Connect to distributed cluster and override default logging.info(f"Initialized Client with {conf.workers} workers and link {client.dashboard_link}") + def main(): args = parse_args() setup_logging() diff --git a/dfanalyzer/plots.py b/dfanalyzer/plots.py index 1a0f92f7..b8d2e732 100644 --- a/dfanalyzer/plots.py +++ b/dfanalyzer/plots.py @@ -5,6 +5,7 @@ import pandas as pd from matplotlib import ticker from typing import Literal, Tuple +import seaborn as sns TIME_COLS = ['io_time', 'app_io_time'] @@ -28,6 +29,7 @@ def 
time_bw_timeline( y2label: str = 'Bandwidth', x_num_ticks: int = 10, y_num_ticks: int = 5, + y_axis_formatter = ticker.FuncFormatter(lambda x, pos: '{:.0f}'.format(x)), ): size_denom = 1024 y2label_suffix = 'KB/s' @@ -52,7 +54,8 @@ def _set_bw(df: pd.DataFrame): .reset_index() \ .map_partitions(_set_bw) \ .compute() \ - .assign(seconds=self._assign_seconds) + .assign(seconds=self._assign_seconds) \ + .sort_values("seconds") fig, ax1 = plt.subplots(figsize=figsize) if time_col == "io_time": @@ -81,12 +84,11 @@ def _set_bw(df: pd.DataFrame): ax1.yaxis.set_major_formatter(ticker.FuncFormatter( - lambda x, pos: '{:.0f}'.format(x/1e6))) + lambda x, pos: '{:.1f}'.format(x/1e6))) ax1.set_xlabel(xlabel) ax1.set_ylabel(y1label) - ax2.yaxis.set_major_formatter(ticker.FuncFormatter( - lambda x, pos: '{:.1f}'.format(x))) + ax2.yaxis.set_major_formatter(y_axis_formatter) if has_y2: ax2.set_ylabel(f"{y2label} ({y2label_suffix})") @@ -131,6 +133,7 @@ def xfer_size_timeline( ylabel: str = 'Transfer Size', x_num_ticks: int = 10, y_num_ticks: int = 5, + y_axis_formatter = ticker.FuncFormatter(lambda x, pos: '{:.0f}'.format(x)), ): xfer_col = 'xfer' @@ -152,7 +155,8 @@ def _set_xfer_size(df: pd.DataFrame): .query("phase == 2") \ .map_partitions(_set_xfer_size) \ .compute() \ - .assign(seconds=self._assign_seconds) + .assign(seconds=self._assign_seconds) \ + .sort_values("seconds") fig, ax = plt.subplots(figsize=figsize) @@ -166,8 +170,7 @@ def _set_xfer_size(df: pd.DataFrame): ax.yaxis.set_major_locator(ticker.LinearLocator(y_num_ticks)) # ax1.yaxis.set_major_formatter(ticker.FuncFormatter( # lambda x, pos: '{:.0f}'.format(x/1e6))) - ax.yaxis.set_major_formatter(ticker.FuncFormatter( - lambda x, pos: '{:.1f}'.format(x))) + ax.yaxis.set_major_formatter(y_axis_formatter) # ax.get_legend().remove() ax.minorticks_on() @@ -223,3 +226,128 @@ def _create_timeline(events: dd.DataFrame): }).reset_index().set_index("trange", sorted=True) return timeline + + +# plots used for graph based 
# plots used for graph based methods
class GrepIOPlots:
    """Matplotlib/seaborn plots used by the graph based analyses."""

    def histogram(self, data, x_label):
        """Plot a 23-bin histogram of ``data`` with a logarithmic y-axis."""
        plt.hist(data, bins=23, color='skyblue', edgecolor='black')
        plt.yscale('log')  # Set logarithmic scale for the y-axis
        plt.xlabel(x_label)
        plt.ylabel('Frequency (log scale)')
        plt.title('Histogram of '+x_label)
        plt.show()

    def bar_graph(self, data, xlabel):
        '''
        Plot one bar per index entry of ``data`` and annotate each bar with its value.

        Example:
        value_counts = analyzer_mummi1.events['Name'].value_counts().compute()
        bar_graph(value_counts,"Name")
        '''
        # Extract values and counts
        values = data.index.tolist()
        counts = data.values.tolist()
        plt.figure(figsize=(10, 6))
        plt.bar(values, counts)
        # Add numerical values on the bars
        for xi, yi in zip(values, counts):
            plt.text(xi, yi, f'{yi:.2f}', ha='center', va='bottom', fontsize=6)
        plt.xlabel('Values')
        plt.ylabel('Counts')
        plt.title("Bar Graph of "+xlabel+" counts")
        # Rotate x-axis labels if needed
        plt.xticks(rotation=45)
        plt.show()

    def line_plot(self, ddf):
        '''
        Plot the ``ts`` value found at 10 evenly spaced row positions of the
        (already sorted) dataframe against those positions.

        Usage:
        line_plot(sorted_events)
        '''
        # Materialize once; computing inside the sampling loop repeats the work per point.
        pdf = ddf.compute()
        if len(pdf) == 0:
            return
        idx = np.linspace(0, len(pdf) - 1, 10, dtype=int)
        ts = pdf['ts'].iloc[idx].tolist()

        plt.plot(ts, idx, linestyle='--', marker='o')
        plt.xlabel('Time (ts)')
        plt.ylabel('Index in Dataframe')
        plt.title('Index vs Time')
        plt.show()

    def hmap(self, mp=None, df=None, size_bins=[], size_labels=[], degree_bins=[], degree_labels = []):
        '''
        Draw a heatmap of the reciprocal of the mean ``interference`` per
        (size bin, degree bin).

        ``df`` is a pandas dataframe with ``size``, ``deg_caller``,
        ``interference`` and ``mount_point`` columns; when ``mp`` is given only
        the rows of that mount point are used. ``size_bins``/``degree_bins`` and
        their labels are passed to ``pandas.cut``. Cells without data are
        filled with -0.5 before the reciprocal is taken. The caller's
        dataframe is not modified.
        '''
        if df is None:
            raise ValueError("hmap requires a dataframe (df)")
        if mp:
            df = df[df['mount_point'] == mp]
        else:
            mp = "All Mount Points"
        # Work on a copy: the columns added below must not leak into the
        # caller's frame nor be written to a filtered slice.
        df = df.copy()
        df['size_category'] = pd.cut(df['size'], bins=size_bins, labels=size_labels, right=False)
        df['deg_category'] = pd.cut(df['deg_caller'], bins=degree_bins, labels=degree_labels, right=False)

        df['interference'] = df['interference'].astype(float)

        all_combinations = pd.MultiIndex.from_product(
            [df['size_category'].unique(), df['deg_category'].unique()],
            names=['size_category', 'deg_category'])
        merged = pd.merge(df, pd.DataFrame(index=all_combinations).reset_index(),
                          on=['size_category', 'deg_category'], how='left')
        grouped = merged.groupby(['size_category', 'deg_category']).agg({'interference': 'mean'}).reset_index()

        pivot_table = grouped.pivot(index='size_category', columns='deg_category', values='interference').fillna(-0.5)
        # Create heatmap using seaborn
        plt.figure(figsize=(8, 6))
        pivot_table = 1/pivot_table
        sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='.1f', linewidths=.5)
        plt.title(f'Average Interference Heatmap {mp}')
        plt.xlabel('Degrees')
        plt.ylabel('Sizes')
        plt.show()

    def correlation(self, ddf, group_col = '', col_a = '', col_b = '' ):
        '''
        Compute, per value of ``group_col``, the correlation between ``col_a``
        and ``col_b`` of the Dask dataframe ``ddf`` and plot it averaged over
        bins of 50 group values.
        '''
        grouped = ddf.groupby(group_col).apply(
            lambda x: x[col_a].corr(x[col_b]),
            meta=('correlation', 'f8')
        ).compute()
        self.correlation_plot(grouped, bin_size=50, group_col=group_col,
                              col_a=col_a, col_b=col_b)

    def correlation_plot(self, grouped, bin_size = 50, group_col = '', col_a = '', col_b = '' ):
        '''
        Plot a precomputed per-group correlation series, averaged over bins of
        ``bin_size`` consecutive index values (``index // bin_size``).
        '''
        grouped_binned = grouped.groupby(grouped.index // bin_size).mean()
        plt.figure(figsize=(15, 6))
        plt.plot(grouped_binned.index * bin_size, grouped_binned.values, marker='o')
        plt.title(f'Average Correlation ({col_a} and {col_b})')
        plt.xlabel(f'{group_col} (binned)')
        plt.ylabel('Correlation')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
hasattr(args, name): - setattr(args, name, value) - if name == "epoch": - self._arguments["epoch"] = str(value) - elif name == "image_idx": - self._arguments["image_idx"] = str(value) - elif name == "image_size": - self._arguments["image_size"] = str(value) - elif name == "step": - self._arguments["image_size"] = str(value) + full_args = dict(zip(arg_names[1:], args[1:])) + full_args.update(kwargs) + full_args.update(get_default_args(func)) + + for name, value in full_args.items(): + if name == "epoch": + self._arguments["epoch"] = str(value) + elif name == "image_idx": + self._arguments["image_idx"] = str(value) + elif name == "image_size": + self._arguments["image_size"] = str(value) + elif name == "step": + self._arguments["image_size"] = str(value) start = dftracer.get_instance().get_time() dftracer.get_instance().enter_event() @@ -261,6 +270,7 @@ def new_init(*args, **kwargs): if DFTRACER_ENABLE: arg_values = dict(zip(arg_names[1:], args)) arg_values.update(kwargs) + arg_values.update(get_default_args(init)) if "epoch" in arg_values: self._arguments["epoch"] = str(arg_values["epoch"]) elif "image_idx" in arg_values: diff --git a/docs/api.rst b/docs/api.rst index 8f50098d..6f764b38 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -56,23 +56,23 @@ ENV Variables supported ================================ ====== =========================================================================== Environment Variable Type Description ================================ ====== =========================================================================== - DFTRACER_CONFIGURATION STRING PATH to the yaml configuration - DFTRACER_ENABLE INT Enable or Disable DFTracer (default 0). - DFTRACER_INIT STRING DFTracer Mode FUNCTION/PRELOAD (default FUNCTION). - For Hybrid use PRELOAD mode. - DFTRACER_LOG_FILE STRING PATH To log file. In this case process id and app name is appended to file. - DFTRACER_DATA_DIR STRING Colon separated paths that will be traced for I/O accesses by profiler. 
- For tracing all directories use the string "all" (not recommended). - DFTRACER_INC_METADATA INT Include or exclude metadata (default 0) - DFTRACER_SET_CORE_AFFINITY INT Include or exclude core affinity (default 0). - DFTRACER_INC_METADATA needs to be enabled. - DFTRACER_GOTCHA_PRIORITY INT PRIORITY of DFTracer in GOTCHA (default: 1). - DFTRACER_LOG_LEVEL STRING Logging level within DFTracer ERROR/WARN/INFO/DEBUG (default ERROR). - DFTRACER_DISABLE_IO STRING Disable automatic binding of all I/O calls. - DFTRACER_DISABLE_POSIX STRING Disable automatic binding of POSIX I/O calls. - DFTRACER_DISABLE_STDIO STRING Disable automatic binding of STDIO I/O calls. - DFTRACER_TRACE_COMPRESSION INT Enable trace compression (default 1) - DFTRACER_DISABLE_TIDS INT Disable tracing of thread ids (default 0). + DFTRACER_CONFIGURATION STRING PATH to the yaml configuration + DFTRACER_ENABLE INT Enable or Disable DFTracer (default 0). + DFTRACER_INIT STRING DFTracer Mode FUNCTION/PRELOAD (default FUNCTION). + For Hybrid use PRELOAD mode. + DFTRACER_LOG_FILE STRING PATH To log file. In this case process id and app name is appended to file. + DFTRACER_DATA_DIR STRING Colon separated paths that will be traced for I/O accesses by profiler. + For tracing all directories use the string "all" (not recommended). + DFTRACER_INC_METADATA INT Include or exclude metadata (default 0) + DFTRACER_SET_CORE_AFFINITY INT Include or exclude core affinity (default 0). + DFTRACER_INC_METADATA needs to be enabled. + DFTRACER_GOTCHA_PRIORITY INT PRIORITY of DFTracer in GOTCHA (default: 1). + DFTRACER_LOG_LEVEL STRING Logging level within DFTracer ERROR/WARN/INFO/DEBUG (default ERROR). + DFTRACER_DISABLE_IO STRING Disable automatic binding of all I/O calls. + DFTRACER_DISABLE_POSIX STRING Disable automatic binding of POSIX I/O calls. + DFTRACER_DISABLE_STDIO STRING Disable automatic binding of STDIO I/O calls. 
+ DFTRACER_TRACE_COMPRESSION INT Enable trace compression (default 1) + DFTRACER_DISABLE_TIDS INT Disable tracing of thread ids (default 0). ================================ ====== =========================================================================== ---------------------------------------- @@ -126,6 +126,7 @@ Function Profiling To profile a function, add the wrapper ``DFTRACER_CPP_FUNCTION`` at the start of the function .. code-block:: c + void foo() { DFTRACER_CPP_FUNCTION(); sleep(1); diff --git a/docs/bash_utilities.rst b/docs/bash_utilities.rst new file mode 100644 index 00000000..c5ec4edb --- /dev/null +++ b/docs/bash_utilities.rst @@ -0,0 +1,92 @@ +======================== +Bash Utility scripts +======================== + +This section describes the bash utilities that are compatible with DFTracer logs + +---------- + +------------------- +Handling .pfw files +------------------- + +The DFTracer format with extension .pfw is an uncompressed file which can be viewed using the following utilities. + +1. `vim` : Edit .pfw files +2. `cat`, `head`, or `tail`: view a portion of the .pfw files + + +---------------------- +Handling .pfw.gz files +---------------------- + +The DFTracer compressed format with .pfw.gz extension can first be decompressed using gzip and then piped to the above .pfw utilities. + +.. code-block:: bash + + gzip -c -d `echo *.gz` | head + +-------------------- +Extracting JSON data +-------------------- + +Once the uncompressed data is available, the JSON utility `jq` can be used to parse args. + +In each case we have to remove the first `[` which has been added to support perfetto ui. + +For uncompressed files + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '.' + + +For compressed files + +.. code-block:: bash + + gzip -c -d `echo *.gz` | grep -i "[^#[]" | jq -c '.' + +We can extract specific fields from these JSON lines as follows + +1. `jq -c '.name'`: extracts all the names of events +2.
`jq -c '.cat'`: extracts all the categories of events +3. `jq -c '.args.hostname'`: extracts the fields from extra args like hostname in this case. + +Useful querying using jq +************************ + +Extract unique functions with their counts from traces. + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '.name' | sort | uniq -c + +Extract unique categories with their counts from traces. + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '.cat' | sort | uniq -c + +Extract unique process id and thread id combination with their counts from traces. + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '"\(.pid) \(.tid)"' | sort | uniq -c + +Extract min timestamp + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '.ts | tonumber' | sort -n | head -n 1 + +Extract max timestamp + +.. code-block:: bash + + cat *.pfw | grep -i "[^#[]" | jq -c '.ts | tonumber' | sort -n | tail -n 1 + + +For more commands on `jq` refer to `JQ Manual +<https://jqlang.github.io/jq/manual/>`_. + diff --git a/docs/build.rst b/docs/build.rst index f70256fe..8840ef43 100644 --- a/docs/build.rst +++ b/docs/build.rst @@ -36,7 +36,7 @@ From Github .. code-block:: Bash - DFT_VERSION=v1.0.2 + DFT_VERSION=v1.0.3 pip install git+https://github.com/hariharan-devarajan/dftracer.git@${DFT_VERSION} ..
attention:: diff --git a/docs/conf.py b/docs/conf.py index ded7eddf..caf4c0a3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = u'0.0' # The full version, including alpha/beta/rc tags -release = u'1.0.2' +release = u'1.0.3' # -- General configuration --------------------------------------------------- diff --git a/docs/developer-guide.rst b/docs/developer-guide.rst index e69de29b..0e270238 100644 --- a/docs/developer-guide.rst +++ b/docs/developer-guide.rst @@ -0,0 +1,69 @@ +====================== +Developer Guide +====================== + +------------------------------------------ +ALCF Polaris +------------------------------------------ + +These are the steps needed to compile :code:`dftracer` on `ALCF Polaris `_ + +First, make sure you have set up the environment variable and source it as shown `here `_. Then, you can modify the :code:`dftracer` codebase and compile it by running the commands below: + +.. code-block:: bash + + module use /soft/modulefiles + module load conda + module unload darshan + conda activate base + source + + export CC=cc + export CXX=CC + export CMAKE_BUILD_TYPE=PROFILE + export DFTRACER_ENABLE_TESTS=On + export DFTRACER_LOGGER_USER=1 + export DFTRACER_DISABLE_HWLOC=On + export DFTRACER_TEST_LD_LIBRARY_PATH=/opt/cray/libfabric/1.15.2.0/lib64 + pip install -v ".[dfanalyzer]" + +.. note:: + + We need to unload :code:`darshan` here because it causes many :code:`segfault` errors on the Polaris machine due to the POSIX API interception done by Darshan + +Then, to run the tests, you need to run the commands below: + +.. code-block:: bash + + module use /soft/modulefiles + module load conda + module unload darshan + conda activate base + source + + pip install -r test/py/requirements.txt + pushd build/temp*/*pydftracer*/ + ctest -E dlio -VV --debug --stop-on-failure + popd + + +Updating Docs +============= + +For updating the docs we need to install the additional dependency :code:`Sphinx` + +..
code-block:: bash + + module use /soft/modulefiles + module load conda + module unload darshan + conda activate base + source + + pip install "Sphinx<7" + + cd /docs + make html + + +Then open :code:`_build/html/index.html` diff --git a/docs/dfanalyzer_alcf_polaris.rst b/docs/dfanalyzer_alcf_polaris.rst new file mode 100644 index 00000000..eaac7f8f --- /dev/null +++ b/docs/dfanalyzer_alcf_polaris.rst @@ -0,0 +1,187 @@ +=========================== +ALCF Polaris +=========================== + +This section describes how to run DFAnalyzer on `ALCF Polaris `_. + +---------- + +Make sure you already did the necessary steps to build the :code:`dfanalyzer` :doc:`here `. + +.. warning:: + + All the steps below should be run in the Polaris login node! + +---------------------------------------- +Logging in to Polaris Login Node +---------------------------------------- + +.. code-block:: bash + + ssh @polaris.alcf.anl.gov # and type password from MobilePass/other auth + +.. note:: + + We recommend you to use VSCode to connect to the Polaris login node as it supports opening Jupyter Notebook and do port forwarding natively + +---------------------------------------- +Initializing Dask Configurations +---------------------------------------- + +.. code-block:: bash + + cd /dfanalyzer/dask/conf + ./install_dask_env.sh + +.. note:: + + This will create new directory :code:`$HOME/.dftracer/` with files: :code:`$HOME/.dftracer/configuration.sh` and :code:`$HOME/.dftracer/configuration.yaml` + +---------------------------------------- +Changing :code:`$HOME/.dftracer/configuration.yaml` +---------------------------------------- + +.. code-block:: bash + + cd $HOME/.dftracer + configuration.yaml + +By default, :code:`$HOME/.dftracer/configuration.yaml` will contain this entry + +.. 
code-block:: yaml + + app: /usr/WS2/haridev/dftracer + env: ${DFTRACER_APP}/venv + +Please modify :code:`app` to point to your cloned :code:`` directory and :code:`env` to the Python virtual environment that you used to install the :code:`dfanalyzer` code :doc:`here `. + +Specifically, for ALCF Polaris, we should add one more entry to :code:`$HOME/.dftracer/configuration.yaml` + +.. code-block:: yaml + + account: + +.. note:: + + This account will be used to reserve compute nodes for the Dask distributed workers + +---------------------------------------- +Changing :code:`polaris.yaml` config +---------------------------------------- + +.. code-block:: bash + + cd /dfanalyzer/dask/conf + polaris.yaml + +.. note:: + + Please change the :code:`polaris.yaml` configuration based on your needs. For example, if you need more nodes, you can change :code:`num_nodes` under the :code:`job` key, or change :code:`wall_time_min`, etc. For more information regarding the ALCF Polaris queues, please see `Running Jobs on Polaris `_ + + +---------------------------------------- +Executing scheduler +---------------------------------------- + +.. code-block:: bash + + cd /dfanalyzer/dask/scripts + ./start_dask_distributed.sh + +.. note:: + + Wait for several seconds because this script will try to reserve the compute nodes for you using the PBS Job Scheduler + +.. warning:: + + If you get an error saying that the port is already in use, try changing the port in :code:`` + +If it runs successfully, you should see the message below + +.. image:: images/dfanalyzer/polaris/run-scheduler.png + :width: 800 + :alt: Running Dask Scheduler on ALCF Polaris + +.. note:: + + Please check the file `/dfanalyzer/dask/logs/worker_.log` in case there are problems when running the workers on the compute nodes + +---------------------------------------- +Forwarding the Port +---------------------------------------- + +We recommend running the notebook inside VSCode because it supports port forwarding natively.
In VSCode, navigate to the bottom bar (where the terminal is). Now, click on the :code:`PORTS` tab as shown in the screenshot below + +.. image:: images/dfanalyzer/polaris/vscode-ports-tab.png + :width: 800 + :alt: VSCode Ports Tab + +--------- + +Then, click :code:`Add Port` below + +.. image:: images/dfanalyzer/polaris/vscode-add-port-button.png + :width: 800 + :alt: VSCode Add Port Button + +to add a new port and type :code:`8787`, since that port is used for the :code:`Dask` monitoring webpage. If you typed it correctly, you should see the port added as a new entry + +.. image:: images/dfanalyzer/polaris/vscode-added-port.png + :width: 800 + :alt: VSCode Added Port + +--------- + +Now, try connecting to `http://localhost:8787 `_ and, voila, you will see the :code:`Dask` scheduler monitoring! + +.. image:: images/dfanalyzer/polaris/dask-scheduler-monitoring.png + :width: 800 + :alt: Dask Scheduler Monitoring + +---------------------------------------- +Opening Notebook File +---------------------------------------- + +In your VSCode, navigate to + +.. code-block:: bash + + /examples/dfanalyzer/dfanalyzer-distributed.ipynb + +And just run each cell as usual. + +.. note:: + + Please use this as the starting point to analyze your traces. Feel free to copy and adjust it if needed! + +---------------------------------------- +Stopping Dask Distributed Workers +---------------------------------------- + +.. code-block:: bash + + cd /dfanalyzer/dask/scripts + ./stop_dask_distributed.sh + +.. note:: + + Wait for several seconds because this script will try to kill the workers and deallocate the compute nodes + +---------------------------------------- +Tips and Tricks +---------------------------------------- + +#. Add additional scripts to be executed on the compute node + + Sometimes we need to execute scripts before starting the worker, e.g. to set up additional environment variables such as :code:`LD_LIBRARY_PATH`.
+ For this purpose, :code:`dftracer` supports this by editing :code:`$HOME/.dftracer/configuration.sh`. + + .. code-block:: bash + + $HOME/.dftracer/configuration.sh + # + # add new line at the end of the file + # e.g. + # export LD_LIBRARY_PATH=/opt/cray/libfabric/1.15.2.0/lib64:${LD_LIBRARY_PATH} + + diff --git a/docs/dfanalyzer_build.rst b/docs/dfanalyzer_build.rst new file mode 100644 index 00000000..ea4813e1 --- /dev/null +++ b/docs/dfanalyzer_build.rst @@ -0,0 +1,37 @@ +=========================== +Build +=========================== + +------------------------------------------ +From source (Recommended) +------------------------------------------ + +.. code-block:: Bash + + git clone git@github.com:hariharan-devarajan/dftracer.git + cd dftracer + pip install ".[dfanalyzer]" + +------------------------------------------ +From pip +------------------------------------------ + +.. code-block:: Bash + + pip install pydftracer[dfanalyzer] + +=============================== +Getting Started with DFAnalyzer +=============================== + +The most user-friendly way to utilize DFAnalyzer to analyze traces from DFTracer is to use Jupyter Notebooks. +To run the notebook you will have to install Jupyter. We have a simple requirements.txt file for that as well. + + +.. code-block:: Bash + + cd dftracer + pip install -r examples/dfanalyzer/requirements.txt + + +A simple example of loading DFAnalyzer, along with quick recommended queries, is available in the example notebook. Navigate to :code:`/examples/dfanalyzer/dfanalyzer_distributed.ipynb` and run your notebook. \ No newline at end of file diff --git a/docs/dfanalyzer_conf.rst b/docs/dfanalyzer_conf.rst new file mode 100644 index 00000000..b256ba5e --- /dev/null +++ b/docs/dfanalyzer_conf.rst @@ -0,0 +1,107 @@ +======================================== +Running Dask Distributed in a new system +======================================== + +This section describes how to configure DFAnalyzer to run on your cluster.
+ +---------- + +Make sure you have already completed the necessary steps to build the :code:`dfanalyzer`. See the :doc:`build` documentation for details. + +---------------------------------------- +Initializing Dask Configurations +---------------------------------------- + +.. code-block:: bash + + cd /dfanalyzer/dask/conf + ./install_dask_env.sh + +.. note:: + + This will create a new directory :code:`$HOME/.dftracer/` with files: :code:`$HOME/.dftracer/configuration.sh` and :code:`$HOME/.dftracer/configuration.yaml` + +---------------------------------------- +Editing :code:`$HOME/.dftracer/configuration.yaml` +---------------------------------------- + +.. code-block:: bash + + cd $HOME/.dftracer + configuration.yaml + +By default, :code:`$HOME/.dftracer/configuration.yaml` will contain this entry: + +.. code-block:: yaml + + app: /usr/WS2/haridev/dftracer + env: ${DFTRACER_APP}/venv + +Please modify the :code:`app` to your cloned :code:`` directory and :code:`env` to the Python virtual environment that you used to install the :code:`dfanalyzer` code. Refer to the :doc:`build` documentation for details. + +---------------------------------------- +Editing Dask Configurations depending on your System +---------------------------------------- + +In the `/dfanalyzer/dask/conf/` folder create a new `.yaml` file for the system you want to use. The `.yaml` file should consist of the following fields: + +- config: this field contains locations for the directories containing files for the dask distributed cluster. + - script_dir: the scripts to run dask distributed + - conf_dir: the `.yaml` configuration files + - run_dir: the folder which will contain the :code:`scheduler_{$USER}.pid` and :code:`scheduler_{$USER}.json` file used to store information for the scheduler. + - log_dir: the folder which will contain the logs for the dask distributed scheduler and workers. +- job: information about the job you want to run to create the dask distributed cluster.
+ - num_nodes: number of nodes which are going to be used to run the dask distributed cluster. + - wall_time_min: time (in minutes) which the dask distributed cluster is going to run for. + - env_id: the name of the job which will run. + - queue: the queue which the job will run in. +- scheduler: information used to run the scheduler of the dask distributed cluster. + - cmd: command used to run the scheduler. Depending on the system you are using you might need to use FLUX, SLURM or other scheduler. + Examples can look like this: :code:`srun -N ${DFTRACER_JOB_NUM_NODES} -t ${DFTRACER_JOB_WALL_TIME_MIN}` for SLURM scheduler or :code:`flux run -N ${DFTRACER_JOB_NUM_NODES} -t ${DFTRACER_JOB_WALL_TIME_MIN}` for FLUX scheduler. + - port: port used to run dask distributed. + - kill: command used to kill the cluster. + Examples can look like this: :code:`scancel ${SLURM_JOB_ID}` for SLURM scheduler or :code:`flux cancel --all` for FLUX scheduler. +- worker: information used to run the workers of the dask distributed cluster. + - ppn: processes per node for the dask distributed cluster. + - cmd: command used to run the worker. Depending on the system you are using you might need to use FLUX, SLURM or other scheduler. + Examples can look like this: :code:`srun -N ${DFTRACER_JOB_NUM_NODES} --ntasks-per-node=${DFTRACER_WORKER_PPN}` for SLURM scheduler or :code:`srun -N ${DFTRACER_JOB_NUM_NODES} --ntasks-per-node=${DFTRACER_WORKER_PPN}` for FLUX scheduler. + - per_core: number of processes per core + - threads: number of threads used. + - local_dir: a location for a local directory used by dask to cache data frames. It can be set to local storage or shared memory. + - kill: command used to kill the cluster. + Examples can look like this: :code:`scancel ${SLURM_JOB_ID}` for SLURM scheduler or :code:`flux cancel --all` for FLUX scheduler. + +.. code-block:: bash + + cd /dfanalyzer/dask/conf + .yaml + +Below is an example of a `.yaml` that can be used for LC Ruby: + +..
code-block:: bash + config: + script_dir: ${DFTRACER_APP}/dfanalyzer/dask/scripts + conf_dir: ${DFTRACER_APP}/dfanalyzer/dask/conf + run_dir: ${DFTRACER_APP}/dfanalyzer/dask/run_dir + log_dir: ${DFTRACER_APP}/dfanalyzer/dask/logs + job: + num_nodes: 1 + wall_time_min: 60 + env_id: SLURM_JOB_ID + worker: + ppn: 16 + cmd: srun -N ${DFTRACER_JOB_NUM_NODES} -t ${DFTRACER_JOB_WALL_TIME_MIN} + per_core: 1 + threads: 1 + local_dir: /dev/shm/dask-workspace + kill: scancel ${SLURM_JOB_ID} + scheduler: + cmd: srun -N ${DFTRACER_JOB_NUM_NODES} -t ${DFTRACER_JOB_WALL_TIME_MIN} --ntasks-per-node=${DFTRACER_WORKER_PPN} + port: 12001 + kill: scancel ${SLURM_JOB_ID} + +---------------------------------------- +Run DFAnalyzer +---------------------------------------- + +Navigate to :code:`/examples/dfanalyzer/dfanalyzer_distributed.ipynb` and run your notebook. diff --git a/docs/dfanalyzer_distributed.rst b/docs/dfanalyzer_distributed.rst new file mode 100644 index 00000000..b7cf7dc9 --- /dev/null +++ b/docs/dfanalyzer_distributed.rst @@ -0,0 +1,87 @@ +=================================== +Running DFAnalyzer with Dask Distributed +=================================== + +----------------------------------- +Getting Started +----------------------------------- + +1. Create a Python virtual environment (Python version>3.7). +2. Source the Python virtual environment. +3. Git clone the GitHub repo to get the source code of DFTracer. +4. Navigate into :code:`/path/to/dftracer/dfanalyzer/examples/dfanalyzer`. +5. Build DFTracer as recommended in :doc:`build`. +6. Get all of the requirements as follows in the terminal: + +.. code-block:: bash + + pip install -r requirements.txt + +7. Create a `.yaml` file in :code:`/path/to/dftracer/dfanalyzer/dask/conf` if this is a new system. Please refer to :doc:`dfanalyzer_conf`. + +----------------------------------- +Starting a Dask Distributed Cluster +----------------------------------- + +In the terminal: + +.. 
code-block:: bash + + cd /path/to/dftracer/dfanalyzer/dask/conf + ./install_dask_env.sh + +This will create the `configuration.yaml` in :code:`~/.dftracer`. Update the application and environment path in `configuration.yaml`. You may need to create `run_dir` and `logs` folders if they aren't there already. + +.. code-block:: bash + + cd /path/to/dftracer/dfanalyzer/dask/ + # if logs folder is not present + mkdir logs + # if run_dir is not present + mkdir run_dir + install + ./scripts/start_dask_distributed.sh + +.. note:: + + Wait for several seconds as this script will reserve the compute nodes for you using the job scheduler. + +.. note:: + + Please check the log file :code:`/path/to/dftracer/dfanalyzer/dask/logs/worker_.log` for any issues with running the workers on the compute nodes. + +.. warning:: + + For errors related to port usage, please check if you already have any Dask distributed instances running. You can do so by checking the jobs already running in your scheduler queue or by running the following command in the terminal: + + .. code-block:: bash + + ps -aef | grep dask + + Then kill those jobs/processes using :code:`kill -9 `. You may also need to change the port number in the `.yaml` files located at `/path/to/dftracer/dfanalyzer/dask/conf`. For more details about these configurations refer to :doc:`here `. + + +----------------------------------- +Use DFAnalyzer +----------------------------------- + +To use the Jupyter notebook of DFAnalyzer, navigate to :code:`/path/to/dftracer/examples` and find the `dfanalyzer_distributed.ipynb`. + +---------------------------------------- +Accessing the Dask Dashboard +---------------------------------------- + +It is recommended to run the notebook inside VSCode because it supports port forwarding natively. In VSCode, navigate to the bottom bar (where the terminal is), and click on the :code:`PORTS` tab.
Click :code:`Forward Port` to add a new port and type the port that was used when :code:`setup_dask_cluster()` was run in your `dfanalyzer.ipynb` notebook. Connect to `http://localhost:PORT `_ to see the :code:`Dask` scheduler monitoring. + +---------------------------------------- +Stopping Dask Distributed Workers +---------------------------------------- + +.. code-block:: bash + + cd /path/to/dftracer/dfanalyzer/dask/scripts + ./stop_dask_distributed.sh + +.. note:: + + Wait for several seconds as this script will terminate the workers and deallocate the compute nodes. diff --git a/docs/examples.rst b/docs/examples.rst index a94d98cc..afafa7d4 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -1,10 +1,6 @@ ================ Example Programs -================ - - ------------ - +================ ------------ C++ Example diff --git a/docs/images/dfanalyzer/polaris/dask-scheduler-monitoring.png b/docs/images/dfanalyzer/polaris/dask-scheduler-monitoring.png new file mode 100644 index 00000000..7310ddcc Binary files /dev/null and b/docs/images/dfanalyzer/polaris/dask-scheduler-monitoring.png differ diff --git a/docs/images/dfanalyzer/polaris/run-scheduler.png b/docs/images/dfanalyzer/polaris/run-scheduler.png new file mode 100644 index 00000000..0aa09460 Binary files /dev/null and b/docs/images/dfanalyzer/polaris/run-scheduler.png differ diff --git a/docs/images/dfanalyzer/polaris/vscode-add-port-button.png b/docs/images/dfanalyzer/polaris/vscode-add-port-button.png new file mode 100644 index 00000000..dad8ce27 Binary files /dev/null and b/docs/images/dfanalyzer/polaris/vscode-add-port-button.png differ diff --git a/docs/images/dfanalyzer/polaris/vscode-added-port.png b/docs/images/dfanalyzer/polaris/vscode-added-port.png new file mode 100644 index 00000000..83977c8d Binary files /dev/null and b/docs/images/dfanalyzer/polaris/vscode-added-port.png differ diff --git a/docs/images/dfanalyzer/polaris/vscode-ports-tab.png
b/docs/images/dfanalyzer/polaris/vscode-ports-tab.png new file mode 100644 index 00000000..668b4b80 Binary files /dev/null and b/docs/images/dfanalyzer/polaris/vscode-ports-tab.png differ diff --git a/docs/images/tracing/1000genome.png b/docs/images/tracing/1000genome.png new file mode 100644 index 00000000..3bf4b560 Binary files /dev/null and b/docs/images/tracing/1000genome.png differ diff --git a/docs/images/tracing/Montage_dur.png b/docs/images/tracing/Montage_dur.png new file mode 100644 index 00000000..8970d962 Binary files /dev/null and b/docs/images/tracing/Montage_dur.png differ diff --git a/docs/images/tracing/Montage_graph.png b/docs/images/tracing/Montage_graph.png new file mode 100644 index 00000000..a8299012 Binary files /dev/null and b/docs/images/tracing/Montage_graph.png differ diff --git a/docs/images/tracing/Montage_summary.png b/docs/images/tracing/Montage_summary.png new file mode 100644 index 00000000..cd014e2a Binary files /dev/null and b/docs/images/tracing/Montage_summary.png differ diff --git a/docs/index.rst b/docs/index.rst index 5a9bde2f..9e457141 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,11 +18,35 @@ DFTracer: is a library for profiling I/O calls and application functions. building_applications api +.. toctree:: + :maxdepth: 2 + :caption: DFAnalyzer + + dfanalyzer_build + dfanalyzer_distributed + dfanalyzer_conf + dfanalyzer_alcf_polaris + +.. toctree:: + :maxdepth: 2 + :caption: Utilities + + utilities + bash_utilities + .. toctree:: :maxdepth: 2 :caption: Reference examples + migration + +.. toctree:: + :maxdepth: 2 + :caption: Applications + + pegasus_montage + pegasus_genome .. 
toctree:: :maxdepth: 2 diff --git a/docs/migration.rst b/docs/migration.rst new file mode 100644 index 00000000..eaab9d26 --- /dev/null +++ b/docs/migration.rst @@ -0,0 +1,473 @@ +================================================ +Migration from DLIO Profiler to DFTracer +================================================ + +This section provides information to DLIO Profiler users on how to migrate their work from DLIO profiler to DFTracer. + +------------------------------------------------ +Installation +------------------------------------------------ + +------------------------------------------------ +Application building migration +------------------------------------------------ +To migrate your Makefile projects from using DLIO Profiler to DFTracer, you will need to update your compilation flags, specifically `CFLAGS` or `CXXFLAGS`, and `LDFLAGS`. +Replace the DLIO Profiler flags with DFTracer flags as shown below: + +.. code-block:: make + :linenos: + :caption: Modifying Makefile to use DFTracer + + # DLIO Profiler Flags (old) + DLIO_CFLAGS = -I/usr/workspace/iopp/kogiou1/venvs/pegasus-env/lib/python3.9/site-packages/dlio_profiler/include + DLIO_LDFLAGS = -L/usr/workspace/iopp/kogiou1/venvs/pegasus-env/lib/python3.9/site-packages/dlio_profiler/lib64 -ldlio_profiler + CFLAGS += $(DLIO_CFLAGS) + LIBS += $(DLIO_LDFLAGS) + + # Replace with DFTracer Flags (new) + # Add DFTracer include and library paths + DFTRACER_CFLAGS = -I/path/to/dftracer/include + DFTRACER_LDFLAGS = -L/path/to/dftracer/lib64 -ldftracer + + # Append to existing CFLAGS and LDFLAGS + CFLAGS += $(DFTRACER_CFLAGS) + LDFLAGS += $(DFTRACER_LDFLAGS) + +------------------------------------------------ +C++ API Changes +------------------------------------------------ + +This section guides you through the necessary changes to migrate your application-level tracing for C++ projects from DLIO Profiler to DFTracer. 
The transition requires updating API names and include directives to use DFTracer's new API. +Please see `examples.rst` for more information on how to use DFTracer APIs. + +Updating Includes +--------------------- + +Replace the old DLIO Profiler include header with the new DFTracer header. This change points your application to the new tracing API. + +.. code-block:: cpp + :linenos: + + // Old include + #include + + // Replace with new include + #include + +Initializing +------------------------ + +Initialization now uses the DFTracer API, which can seamlessly integrate into your existing codebase where DLIO Profiler was previously initialized. + +.. code-block:: cpp + :linenos: + + // Old initialization + DLIO_PROFILER_CPP_INIT(log_file, data_dirs, process_id); + + // Replace with new initialization + DFTRACER_CPP_INIT(log_file, data_dirs, process_id); + +This will configure the DFTracer environment, setting up the log file, data directories, and process ID exactly like the DLIO Profiler did. +To migrate these configurations from DLIO Profiler to DFTracer, please replace your old environment variable configurations as shown below. + +.. code-block:: bash + :linenos: + # Old environment variable configurations for DLIO Profiler + DLIO_LOG_FILE=~/dlio_log + DLIO_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset + export DLIO_INIT=PRELOAD + export DLIO_ENABLE=1 + + +.. code-block:: bash + :linenos: + # Updated environment variable configurations for DFTracer + DFTRACER_LOG_FILE=~/log_file # Changes the log file path variable name + DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset # Consistent data directory path + export DFTRACER_INIT=PRELOAD # Standardizing to PRELOAD mode + export DFTRACER_ENABLE=1 # Enabling the profiler + + +Finalizing +---------------------- + +The finalization process ensures that all tracing data are correctly finalized and saved. Replace the DLIO Profiler finalization call with the DFTracer finalization. + +..
code-block:: cpp + :linenos: + + // Old finalization + DLIO_PROFILER_CPP_FINI(); + + // Replace with new finalization + DFTRACER_CPP_FINI(); + +This function call is crucial for ensuring that your profiling data is not corrupted and is properly written to the log file. + +Function and Region Profiling +----------------------------------- + +For function and code block profiling, replace the old DLIO Profiler functions with their DFTracer counterparts. + +.. code-block:: cpp + :linenos: + + // Old function and region profiling + DLIO_PROFILER_CPP_FUNCTION(); + DLIO_PROFILER_CPP_REGION_(CUSTOM); + + // Replace with new function and region profiling + DFTRACER_CPP_FUNCTION(); + DFTRACER_CPP_REGION_(CUSTOM); + + +------------------------------------------------ +C API Changes +------------------------------------------------ + +This section guides you through the necessary changes to migrate your application-level tracing for C projects from DLIO Profiler to DFTracer. The transition requires updating API names and includes directives to use the DFTracer's new API. +Please see `examples.rst` for more information on how to use DFTracer APIs. + +Updating Includes +--------------------- + +To transition your C projects to DFTracer, begin by updating the include directive to point to the new DFTracer API. + +.. code-block:: c + :linenos: + + // Old include + #include + + // Replace with new include + #include + +Initializing +------------------------ + +For C applications, DFTracer initialization replaces the older DLIO Profiler calls. + +.. code-block:: c + :linenos: + + // Old initialization + DLIO_PROFILER_C_INIT(log_file, data_dirs, process_id); + + // Replace with new initialization + DFTRACER_C_INIT(log_file, data_dirs, process_id); + +This command configures DFTracer with the necessary parameters for logging and directory monitoring, similarly to how DLIO Profiler was configured. 
+To migrate these configurations from DLIO Profiler to DFTracer, please replace your old environment variable configurations as shown below. + +.. code-block:: bash + :linenos: + + # Old environment variable configurations for DLIO Profiler + DLIO_LOG_FILE=~/dlio_log + DLIO_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset + export DLIO_INIT=PRELOAD + export DLIO_ENABLE=1 + + +.. code-block:: bash + :linenos: + + # Updated environment variable configurations for DFTracer + DFTRACER_LOG_FILE=~/log_file # Changes the log file path variable name + DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset # Consistent data directory path + export DFTRACER_INIT=PRELOAD # Standardizing to PRELOAD mode + export DFTRACER_ENABLE=1 # Enabling the profiler + + +Finalizing +---------------------- + +Finalize the DFTracer setup to ensure all tracing data are correctly captured and saved. + +.. code-block:: c + :linenos: + + // Old finalization + DLIO_PROFILER_C_FINI(); + + // Replace with new finalization + DFTRACER_C_FINI(); + + +Function and Region Profiling +----------------------------------- + +Transition function and region profiling in your C code to use DFTracer's updated API methods. + +.. code-block:: c + :linenos: + + // Old function and region profiling + DLIO_PROFILER_C_FUNCTION_START(); + DLIO_PROFILER_C_FUNCTION_END(); + + // Replace with new function and region profiling + DFTRACER_C_FUNCTION_START(); + DFTRACER_C_FUNCTION_END(); + + +------------------------------------------------ +Python API changes +------------------------------------------------ + +------------------------------------------------ +Application building migration +------------------------------------------------ +To migrate your Makefile projects from using DLIO Profiler to DFTracer, you will need to update your compilation flags, specifically `CFLAGS` or `CXXFLAGS`, and `LDFLAGS`. +Replace the DLIO Profiler flags with DFTracer flags as shown below: + +..
code-block:: make + :linenos: + :caption: Modifying Makefile to use DFTracer + + # DLIO Profiler Flags (old) + DLIO_CFLAGS = -I/usr/workspace/iopp/kogiou1/venvs/pegasus-env/lib/python3.9/site-packages/dlio_profiler/include + DLIO_LDFLAGS = -L/usr/workspace/iopp/kogiou1/venvs/pegasus-env/lib/python3.9/site-packages/dlio_profiler/lib64 -ldlio_profiler + CFLAGS += $(DLIO_CFLAGS) + LIBS += $(DLIO_LDFLAGS) + + # Replace with DFTracer Flags (new) + # Add DFTracer include and library paths + DFTRACER_CFLAGS = -I/path/to/dftracer/include + DFTRACER_LDFLAGS = -L/path/to/dftracer/lib64 -ldftracer + + # Append to existing CFLAGS and LDFLAGS + CFLAGS += $(DFTRACER_CFLAGS) + LDFLAGS += $(DFTRACER_LDFLAGS) + +------------------------------------------------ +C++ API Changes +------------------------------------------------ + +This section guides you through the necessary changes to migrate your application-level tracing for C++ projects from DLIO Profiler to DFTracer. The transition requires updating API names and includes directives to use the DFTracer's new API. +Please see `examples.rst` for more information on how to use DFTracer APIs. + +Updating Includes +--------------------- + +Replace the old DLIO Profiler include header with the new DFTracer header. This change points your application to the new tracing API. + +.. code-block:: cpp + :linenos: + + // Old include + #include + + // Replace with new include + #include + +Initializing +------------------------ + +Initialization now uses the DFTracer API, which can seamlessly integrate into your existing codebase where DLIO Profiler was previously initialized. + +.. code-block:: cpp + :linenos: + + // Old initialization + DLIO_PROFILER_CPP_INIT(log_file, data_dirs, process_id); + + // Replace with new initialization + DFTRACER_CPP_INIT(log_file, data_dirs, process_id); + +This will configure the DFTracer environment, setting up the log file, data directories, and process ID exactly like the DLIO Profiler did. 
+To migrate these configurations from DLIO Profiler to DFTracer, please replace your old environment variable configurations as shown below. + + +.. code-block:: bash + :linenos: + + # Old environment variable configurations for DLIO Profiler + DLIO_LOG_FILE=~/dlio_log + DLIO_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset + export DLIO_INIT=PRELOAD + export DLIO_ENABLE=1 + + +.. code-block:: bash + :linenos: + + # Updated environment variable configurations for DFTracer + DFTRACER_LOG_FILE=~/log_file # Changes the log file path variable name + DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset # Consistent data directory path + export DFTRACER_INIT=PRELOAD # Standardizing to PRELOAD mode + export DFTRACER_ENABLE=1 # Enabling the profiler + + +Finalizing +---------------------- + +The finalization process ensures that all tracing data are correctly finalized and saved. Replace the DLIO Profiler finalization call with the DFTracer finalization. + +.. code-block:: cpp + :linenos: + + // Old finalization + DLIO_PROFILER_CPP_FINI(); + + // Replace with new finalization + DFTRACER_CPP_FINI(); + +This function call is crucial for ensuring that your profiling data is not corrupted and is properly written to the log file. + +Function and Region Profiling +----------------------------------- + +For function and code block profiling, replace the old DLIO Profiler functions with their DFTracer counterparts. + +.. code-block:: cpp + :linenos: + + // Old function and region profiling + DLIO_PROFILER_CPP_FUNCTION(); + DLIO_PROFILER_CPP_REGION_(CUSTOM); + + // Replace with new function and region profiling + DFTRACER_CPP_FUNCTION(); + DFTRACER_CPP_REGION_(CUSTOM); + + +------------------------------------------------ +C API Changes +------------------------------------------------ + +This section guides you through the necessary changes to migrate your application-level tracing for C projects from DLIO Profiler to DFTracer.
The transition requires updating API names and include directives to use DFTracer's new API. +Please see `examples.rst` for more information on how to use DFTracer APIs. + +Updating Includes +--------------------- + +To transition your C projects to DFTracer, begin by updating the include directive to point to the new DFTracer API. + +.. code-block:: c + :linenos: + + // Old include + #include + + // Replace with new include + #include + +Initializing +------------------------ + +For C applications, DFTracer initialization replaces the older DLIO Profiler calls. + +.. code-block:: c + :linenos: + + // Old initialization + DLIO_PROFILER_C_INIT(log_file, data_dirs, process_id); + + // Replace with new initialization + DFTRACER_C_INIT(log_file, data_dirs, process_id); + +This command configures DFTracer with the necessary parameters for logging and directory monitoring, similarly to how DLIO Profiler was configured. +To migrate these configurations from DLIO Profiler to DFTracer, please replace your old environment variable configurations as shown below. + +.. code-block:: bash + :linenos: + + # Old environment variable configurations for DLIO Profiler + DLIO_LOG_FILE=~/dlio_log + DLIO_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset + export DLIO_INIT=PRELOAD + export DLIO_ENABLE=1 + + +.. code-block:: bash + :linenos: + + # Updated environment variable configurations for DFTracer + DFTRACER_LOG_FILE=~/log_file # Changes the log file path variable name + DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset # Consistent data directory path + export DFTRACER_INIT=PRELOAD # Standardizing to PRELOAD mode + export DFTRACER_ENABLE=1 # Enabling the profiler + + +Finalizing +---------------------- + +Finalize the DFTracer setup to ensure all tracing data are correctly captured and saved. + +..
code-block:: c + :linenos: + + // Old finalization + DLIO_PROFILER_C_FINI(); + + // Replace with new finalization + DFTRACER_C_FINI(); + + +Function and Region Profiling +----------------------------------- + +Transition function and region profiling in your C code to use DFTracer's updated API methods. + +.. code-block:: c + :linenos: + + // Old function and region profiling + DLIO_PROFILER_C_FUNCTION_START(); + DLIO_PROFILER_C_FUNCTION_END(); + + // Replace with new function and region profiling + DFTRACER_C_FUNCTION_START(); + DFTRACER_C_FUNCTION_END(); + + +------------------------------------------------ +Python API changes +------------------------------------------------ + + +------------------------------------------------ +Analyzer Changes +------------------------------------------------ + +Migration of the DLP Analyzer Jupyter notebook to DFAnalyzer involves configuring the YAML for Dask and renaming the imports and function calls in Jupyter notebook cells. + + +Dask Configuration: +************************** + + +1. ``cd`` to ``dftracer/dfanalyzer/dask/conf`` and run ``install_dask_env.sh`` to create configuration.yaml in ``~/.dftracer``. +2. update the app and environment path in ``configuration.yaml``. + +Jupyter Notebook Update: +************************** + + +1. update ``app_root`` variable by updating path of new ``configuration.yaml``. +2. replace ``dlp_analyzer`` with ``dfanalyzer`` and update the imports from ``dfanalyzer.main`` + +.. code-block:: python + :linenos: + + ... + import dfanalyzer + from dfanalyzer.main import DFAnalyzer,get_dft_configuration,update_dft_configuration,setup_logging,setup_dask_cluster, reset_dask_cluster, get_dft_configuration + ... + +3. update the ``dask_run_dir`` to use dfanalyzer instead of dlp_analyzer. +4. rename update and get configuration functions by calling DFTracer equivalent functions. + +.. code-block:: python + :linenos: + + ... 
+ conf = update_dft_configuration(dask_scheduler=dask_scheduler, verbose=True, + log_file=f"./dft_{os.getenv('USER')}.log", rebuild_index=False, time_approximate=False, + host_pattern=r'lassen(\d+)', time_granularity=30e6, skip_hostname=True, conditions=condition_fn) + conf = get_dft_configuration() + ... + + diff --git a/docs/pegasus_genome.rst b/docs/pegasus_genome.rst new file mode 100644 index 00000000..fdec3031 --- /dev/null +++ b/docs/pegasus_genome.rst @@ -0,0 +1,197 @@ +=================================== +Pegasus 1000-Genome with DFTracer +=================================== + +Instructions for tracing Pegasus 1000 Genome with DFTracer on LC Corona. These instructions can be used for any Workflow but you'll need to change the version of the tar files depending on the architecture of your machine and the workflow you are interested in. +For more information, visit the `workflows repository `_. + +To follow this tutorial you will need the following requirements: +- Installed Condor. +- Installed Pegasus Workflow Manager. +- Python Virtual Environment with DFTracer installed. + +Please refer to :doc:`pegasus_montage` for completing these requirements. + +Step 1: Activate Environment +---------------------------- + +Source the Python Virtual Environment that has DFTracer. + +1.1 Create and activate Virtual Environment: + +.. code-block:: bash + + python3 -m venv /path/to/pegasus-env + source /path/to/pegasus-env/bin/activate + +1.2 Get the dependencies: + +.. code-block:: bash + + pip install git+https://github.com/hariharan-devarajan/dftracer.git + +Step 2: Get the 1000genome-workflow +----------------------------------- + +Get the code: + +.. code-block:: bash + + git clone https://github.com/pegasus-isi/1000genome-workflow.git + +Step 3: Prepare software for Pegasus-1000Genome +----------------------------------------------- + +3.1 Save to PATH: + +.. 
code-block:: bash + + export PATH=/path/to/pegasus/install/bin:$PATH + export PATH=/path/to/pegasus/install/sbin:$PATH + export LD_LIBRARY_PATH=/path/to/pegasus/install//lib:$LD_LIBRARY_PATH + source ~/.bashrc + +3.2 Run Condor: + +.. code-block:: bash + + chmod 777 /path/to/pegasus/install/condor.sh + . /path/to/pegasus/install/condor.sh + condor_master + condor_status # it should show the activity + condor_q # it should show the jobs running + +.. note:: + + If errors occur, echo the `LD_LIBRARY_PATH` and the `PATH` and make sure :code:`/pegasus/install` is there. + +To check if condor_schedd and all other condor processes are running: + +.. code-block:: bash + + ps aux | grep condor + +If Condor throws errors while trying to connect to another node: + +1. Exit the flux allocation: + +.. code-block:: bash + + exit + +2. Check your processes: + +.. code-block:: bash + + ps -u ${USER} + +3. Kill all your processes (or those related to Condor if any): + +.. code-block:: bash + + killall -u ${USER} + +4. Repeat steps 6.3, 6.4, 5.2, 6.5 + +5. If the problem persists: + +.. code-block:: bash + + condor_restart + +3.3 Test Pegasus: + +.. code-block:: bash + + pegasus-version # should show 5.0.7 + +.. note:: + + If error "Cannot find file with permissions" occurs, touch that file and make sure it has those permissions. + +3.4 Configure the Condor/SLURM interface: + +.. code-block:: bash + + pegasus-configure-glite + +.. note:: + + If error "Cannot find file with permissions" occurs, touch that file and make sure it has those permissions. + +3.5 Configure the DFTracer flags: + +.. 
code-block:: bash + + export DFTRACER_INSTALLED=/path/to/pegasus-env/lib/python3.9/site-packages/dftracer/ + export LD_LIBRARY_PATH=$DFTRACER_INSTALLED/lib:$DFTRACER_INSTALLED/lib64:$LD_LIBRARY_PATH + export DFTRACER_LOG_FILE=/path/to/traces/trace + # export DFTRACER_DATA_DIR=all (optional) + export DFTRACER_ENABLE=1 + export DFTRACER_INC_METADATA=1 + # export DFTRACER_INIT=PRELOAD (optional) + export DFTRACER_BIND_SIGNALS=0 + # export DFTRACER_LOG_LEVEL=ERROR (optional) + export DFTRACER_TRACE_COMPRESSION=1 + # dftracer=$DFTRACER_INSTALLED/lib64/libdftracer_preload.so (optional) + +You would only need to use the preload version of DFTracer if you have not annotated the application code you are running. +For more information on the flags and their functionalities please turn to :doc:`examples`. + +Step 4: Annotate 1000-Genome +---------------------------- + +4.1 Configure the DFTracer flags: + +.. code-block:: bash + + export DFTRACER_INSTALLED=/path/to/pegasus-env/lib/python3.9/site-packages/dftracer/ + export LD_LIBRARY_PATH=$DFTRACER_INSTALLED/lib:$DFTRACER_INSTALLED/lib64:$LD_LIBRARY_PATH + export DFTRACER_LOG_FILE=/path/to/traces/trace + # export DFTRACER_DATA_DIR=all (optional) + export DFTRACER_ENABLE=1 + export DFTRACER_INC_METADATA=1 + # export DFTRACER_INIT=PRELOAD (optional) + export DFTRACER_BIND_SIGNALS=0 + # export DFTRACER_LOG_LEVEL=ERROR (optional) + export DFTRACER_TRACE_COMPRESSION=1 + # dftracer=$DFTRACER_INSTALLED/lib64/libdftracer_preload.so (optional) + + +4.2 Navigate to the :code:`/path/to/1000genome-workflow` directory. The source code that is useful to annotate and "time" for 1000 Genome is in the `/bin` folder. As an example we use the `frequency.py` application which is located in :code:`/path/to/1000genome-workflow/bin` folder. We annotate the code as follows: + +.. 
code-block:: python + + from dftracer.logger import dftracer, dft_fn + log_inst = dftracer.initialize_log(logfile=None, data_dir=None, process_id=-1) + + CAT = "PY_APP" + + df_log = dft_fn(CAT) + + ... + + class ... + ... + if __name__ == '__main__': + with dft_fn(name=f"frequency", cat=CAT): + (code...) + log_inst.finalize() + +The idea is to annotate the application so that we capture all the calls that occur during the execution of the main function. These application calls will appear on the traces as events with "CAT:PY_APP" and their name will be "frequency". We can annotate further, by creating regions. For more details please refer to :doc:`examples`. + +4.3 After the annotation with DFTracer, we can run the workflow with pegasus after first creating the data, planning the workflow and executing with `pegasus-run`: + +.. code-block:: bash + + ./prepare_input.sh + ./daxgen.py + ./daxgen.py -D 20130502 -f data.csv -i 1 + +For more information please visit `https://github.com/pegasus-isi/1000genome-workflow/tree/master`. + +4.4 After the workflow finishes we navigate into :code:`/path/to/traces/` that we set earlier with the DFTracer flags. We then load those traces on DFAnalyzer. The application calls will also have "CAT: PY_APP" as this is a Python code workflow. Here is the result of tracing 1000 Genome on LC Corona with 32 nodes and 48 processes per node using DFTracer and analyzing the tracing using DFAnalyzer: + +.. image:: images/tracing/1000genome.png + :width: 800 + :alt: Aggregate duration of 1000 Genome applications. \ No newline at end of file diff --git a/docs/pegasus_montage.rst b/docs/pegasus_montage.rst new file mode 100644 index 00000000..b36a7685 --- /dev/null +++ b/docs/pegasus_montage.rst @@ -0,0 +1,371 @@ +=================================== +Pegasus Montage with DFTracer +=================================== + +Instructions for tracing Pegasus Montage with DFTracer on LC Corona. 
These instructions can be used for any Workflow but you'll need to change the version of the tar files depending on the architecture of your machine and the workflow you are interested in. +For more information, visit the `workflows repository `_. + +Step 1: Install Condor +---------------------- + +1.1 Get the zip: + +.. code-block:: bash + + wget https://research.cs.wisc.edu/htcondor/tarball/10.x/current/condor-x86_64_CentOS8-stripped.tar.gz + +1.2 Untar to your condor folder: + +.. code-block:: bash + + tar -x -f condor*.tar.gz + mkdir condor + cd condor-*stripped + mv * ../condor + cd .. + rm -rf condor-*stripped + rm condor-stripped.tar.gz + +1.3 Configure: + +.. code-block:: bash + + cd condor + ./bin/make-personal-from-tarball + +Step 2: Install Pegasus +----------------------- + +2.1 Get the zip from Tarballs: + +.. code-block:: bash + + wget https://download.pegasus.isi.edu/pegasus/5.0.7/pegasus-binary-5.0.7-x86_64_rhel_7.tar.gz + wget https://download.pegasus.isi.edu/pegasus/5.0.7/pegasus-worker-5.0.7-x86_64_rhel_7.tar.gz + +2.2 Untar to your Pegasus folder (both for pegasus and pegasus-worker): + +.. code-block:: bash + + tar zxf pegasus-*.tar.gz + rm pegasus-*.tar.gz + +Step 3: Install and compile Montage +----------------------------------- + +3.1 Get the code: + +.. code-block:: bash + + git clone https://github.com/Caltech-IPAC/Montage.git + +3.2 Compile: + +.. code-block:: bash + + cd Montage + make + +.. note:: + + Make sure there are no errors. By cloning the GitHub repo, you get the most recent version, likely with no compiler errors. :code:`cd Montage/bin` and make sure it is not empty. + +3.3 Save in Paths: + +.. code-block:: bash + + export PATH=/path/to/Montage/bin:$PATH + +Step 4: Get the montage-pegasus-v3 +---------------------------------- + +4.1 Create and activate Virtual Environment: + +.. code-block:: bash + + python3 -m venv /path/to/pegasus-env + source /path/to/pegasus-env/bin/activate + +4.2 Install dependencies: + +.. 
code-block:: bash + + pip install astropy + pip install pegasus-wms + pip install git+https://github.com/hariharan-devarajan/dftracer.git + +4.3 Get the code: + +.. code-block:: bash + + git clone https://github.com/pegasus-isi/montage-workflow-v3.git + +Step 5: Compile the pegasus-mpi-cluster from source +--------------------------------------------------- + +5.1 Get the code: + +.. code-block:: bash + + git clone https://github.com/pegasus-isi/pegasus.git + +5.2 Make sure you’re in the virtual environment for Pegasus: + +.. code-block:: bash + + source /path/to/pegasus-env/bin/activate + +5.3 Make sure you have the prerequisites: + +1. Git +2. Java 8 or higher +3. Python 3.5 or higher +4. R +5. Ant +6. gcc +7. g++ +8. make +9. tox 3.14.5 or higher +10. mysql (optional, required to access MySQL databases) +11. postgresql (optional, required to access PostgreSQL databases) +12. Python pyyaml +13. Python GitPython + +5.4 Compile: + +.. code-block:: bash + + cd pegasus + ant compile-pegasus-mpi-cluster + +5.5 Copy it to your Pegasus folder: + +.. code-block:: bash + + cd packages/pegasus-mpi-cluster/ + cp pegasus-mpi-cluster/ /path/to/pegasus-5.0.7/bin + +.. note:: + + If errors occur while compiling, make sure that `MVAPICH` is loaded: + +.. code-block:: bash + + module load mvapich2-tce/2.3.7 + echo $LD_LIBRARY_PATH + +Step 6: Create a single “install” directory for all Pegasus software +-------------------------------------------------------------------- + +This will help in resolving errors like “cannot find .. in your path”. + +6.1 Move into the Pegasus directory (the one you compiled from source) and make a directory called install: + +.. code-block:: bash + + cd pegasus + mkdir install + +6.2 Copy all components from pegasus-5.0.7 and condor into the :code:`pegasus/install` folder: + +.. code-block:: bash + + cd ../condor + cp * ../pegasus/install + cp -r * ../pegasus/install + cd ../pegasus-5.0.7 + cp * ../pegasus/install + cp -r * ../pegasus/install + +.. 
note:: + + If you encounter errors about overwriting :code:`/bin` or :code:`/lib` folders, you have to do it manually by changing into those folders and copying everything to :code:`/pegasus/install/bin` or :code:`/pegasus/install/lib`. Make sure all components are there, otherwise Pegasus and Condor cannot run. + +Step 7: Prepare software for Pegasus-Montage +-------------------------------------------- + +7.1 Make sure you are in the virtual environment still. If not, source it again by repeating 5.2. + +7.2 Save to PATH: + +.. code-block:: bash + + export PATH=/path/to/pegasus/install/bin:$PATH + export PATH=/path/to/pegasus/install/sbin:$PATH + export LD_LIBRARY_PATH=/path/to/pegasus/install//lib:$LD_LIBRARY_PATH + source ~/.bashrc + +7.3 Run Condor: + +.. code-block:: bash + + chmod 777 /path/to/pegasus/install/condor.sh + . /path/to/pegasus/install/condor.sh + condor_master + condor_status # it should show the activity + condor_q # it should show the jobs running + +.. note:: + + If errors occur, echo the `LD_LIBRARY_PATH` and the `PATH` and make sure :code:`/pegasus/install` is there. + +To check if condor_schedd and all other condor processes are running: + +.. code-block:: bash + + ps aux | grep condor + +If Condor throws errors while trying to connect to another node: + +1. Exit the flux allocation: + +.. code-block:: bash + + exit + +2. Check your processes: + +.. code-block:: bash + + ps -u ${USER} + +3. Kill all your processes (or those related to Condor if any): + +.. code-block:: bash + + killall -u ${USER} + +4. Repeat steps 6.3, 6.4, 5.2, 6.5 + +5. If the problem persists: + +.. code-block:: bash + + condor_restart + +7.4 Test Pegasus: + +.. code-block:: bash + + pegasus-version # should show 5.0.7 + +.. note:: + + If error "Cannot find file with permissions" occurs, touch that file and make sure it has those permissions. + +7.5 Configure the Condor/SLURM interface: + +.. code-block:: bash + + pegasus-configure-glite + +.. 
note:: + + If error "Cannot find file with permissions" occurs, touch that file and make sure it has those permissions. + +7.6 Configure the DFTracer flags: + +.. code-block:: bash + + export DFTRACER_INSTALLED=/path/to/pegasus-env/lib/python3.9/site-packages/dftracer/ + export LD_LIBRARY_PATH=$DFTRACER_INSTALLED/lib:$DFTRACER_INSTALLED/lib64:$LD_LIBRARY_PATH + export DFTRACER_LOG_FILE=/path/to/traces/trace + # export DFTRACER_DATA_DIR=all (optional) + export DFTRACER_ENABLE=1 + export DFTRACER_INC_METADATA=1 + # export DFTRACER_INIT=PRELOAD (optional) + export DFTRACER_BIND_SIGNALS=0 + # export DFTRACER_LOG_LEVEL=ERROR (optional) + export DFTRACER_TRACE_COMPRESSION=1 + # dftracer=$DFTRACER_INSTALLED/lib64/libdftracer_preload.so (optional) + +You would only need to use the preload version of DFTracer if you have not annotated the application code you are running. +For more information on the flags and their functionalities please turn to :doc:`examples`. + +Step 8: Annotate Montage +---------------------------------- + +8.1 Navigate to the :code:`/path/to/Montage` directory. Most of the source code that is useful to annotate and "time" for Montage is in the `/Montage` and `/MontageLib` folders. As an example we use the `mDiff.c` application which is located in :code:`/path/to/Montage/Montage` folder. The first step is to link the source code with DFTracer. For that we edit the `Makefile` located in the same folder as follows: + +.. code-block:: make + + .SUFFIXES: + .SUFFIXES: .c .o + + CC = gcc + MPICC = + CFLAGS = -g -O2 -I. 
-I../lib/include -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -std=c99 + LIBS = -L../lib -lwcs -lcfitsio -lcoord -lmtbl -lsvc \ + -lwww -lboundaries -lpixbounds -ltwoplane -lm + + # Define flags + DF_CFLAGS = -I/path/to/pegasus-env/lib/python3.9/site-packages/dftracer/include + DF_LDFLAGS = -L/path/to/pegasus-env/lib/python3.9/site-packages/dftracer/lib64 -ldftracer + + # Add flags to CFLAGS and LIBS + CFLAGS += $(DF_CFLAGS) + LIBS += $(DF_LDFLAGS) + + + SBINS = mConvert mFlattenExec mHdrCheck mHdrtbl mTblSort mTileHdr mTileImage + MBINS = mProjExecMPI mFitExecMPI mDiffExecMPI mBgExecMPI mAddMPI mAddExecMPI + + BINS = $(SBINS) + + + # uncomment the next two lines to build MPI modules + MPICC = mpicc + BINS = $(SBINS) $(MBINS) + + + .c.o: + $(CC) $(CFLAGS) -c $*.c + + mDiff: mDiff.o debugCheck.o checkHdr.o checkWCS.o + $(CC) -o mDiff mDiff.o debugCheck.o checkHdr.o checkWCS.o \ + $(LIBS) + +8.2 Edit the :code:`/path/to/Montage/Montage/mDiff.c` file as follows: + +.. code-block:: c + + #include <dftracer/dftracer.h> + ... + int main(int argc, char **argv) + { + DFTRACER_C_INIT(NULL, NULL, NULL); + ... + if (MPI_err != 0) { + printf("[struct stat=\"ERROR\", msg=\"MPI initialization failed\"]\n"); + DFTRACER_C_FINI(); + exit(1); + } + ... + DFTRACER_C_FINI(); + exit(1); + } + +The idea is to initialize DFTracer at the start of the `main` function and then finalize DFTracer before we exit the function so that we can capture the time, as well as the application and the system calls with DFTracer that take place while the mDiff application is running. We can annotate further, by creating regions. For more details please refer to :doc:`examples`. + +8.3 After annotating the code we need to compile Montage using DFTracer. For that we need to source the Python environment that we have used to install DFTracer in and install it from source too (to access DFAnalyzer Jupyter Notebook). For details on that please refer to :doc:`build`. 
We then compile Montage as normal: + +.. code-block:: bash + + cd /path/to/Montage/ + make + +8.4 After the compilation with DFTracer, we can run Montage with pegasus after navigating to :code:`/path/to/montage-pegasus-v3/example-2mass.sh`. The steps are that we create the data by running `montage-workflow.py`, then plan the workflow and then run it with `pegasus-run`. For more information please visit ``. + +8.5 After the workflow finishes we navigate into :code:`/path/to/traces/` that we set earlier with the DFTracer flags. We then load those traces on DFAnalyzer. Depending on which application or system call was first, the traces will contain information regarding the "Level" of the call and "args:p_idx". The application calls will also have "CAT: C_APP" as this is a C code workflow. These can be used to further create a graph of calls. Here is the result of tracing Montage on LC Corona with 2 nodes and 55 processes using DFTracer and analyzing the tracing using DFAnalyzer: + +.. image:: images/tracing/Montage_summary.png + :width: 800 + :alt: Summary generated by DFAnalyzer for Montage. + +.. image:: images/tracing/Montage_graph.png + :width: 800 + :alt: Two-level graph of application calls for Montage. + +.. image:: images/tracing/Montage_dur.png + :width: 800 + :alt: Aggregate duration of traced calls for Montage. \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 4170c03e..70b23e3d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,2 @@ -sphinx-rtd-theme \ No newline at end of file +sphinx-rtd-theme +Sphinx<7 \ No newline at end of file diff --git a/docs/utilities.rst b/docs/utilities.rst new file mode 100644 index 00000000..4db5650b --- /dev/null +++ b/docs/utilities.rst @@ -0,0 +1,50 @@ +======================== +DFTracer Utility scripts +======================== + +This section describes the utilities provided by DFTracer to assist users with logs. 
+ +---------- + +All scripts are installed with DFTracer in the installation directory's bin folder. + +------------------ +Merge Trace script +------------------ + +This script allows users to combine all pfw-format trace files into one. +It has the following signature. + +.. code-block:: bash + + /bin/merge_pfw [-fcv] [-d input_directory] [-o output_file] + +Arguments for this script are + +1. **-d input_directory** folder containing all trace files. Default `PWD`. +2. **-o output_file** file for storing merged file. Default `combined.pfw`. +3. **-f** override output file. +4. **-c** compress output file. +5. **-v** enable verbose mode. +6. **-h** display help + +------------------ +Compaction script +------------------ + +The script compacts all trace files and then divides the trace into equal file pieces. + +.. code-block:: bash + + /bin/dftracer_compact [-fcv] [-d input_directory] [-o output_directory] [-l num_lines] [-p prefix] + +Arguments for this script are + +1. **-d input_directory** specify input directories. Should contain .pfw or .pfw.gz files. Default `PWD`. +2. **-o output_directory** specify output directory. Default `combined.pfw`. +3. **-l num_lines** lines per trace. +4. **-p prefix** prefix to be used for compact files. +5. **-f** override output directory. +6. **-c** compress output file. +7. **-v** enable verbose mode. +8. **-h** display help diff --git a/examples/dfanalyzer/dfanalyzer-distributed.ipynb b/examples/dfanalyzer/dfanalyzer-distributed.ipynb new file mode 100644 index 00000000..4fd0dee3 --- /dev/null +++ b/examples/dfanalyzer/dfanalyzer-distributed.ipynb @@ -0,0 +1,699 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b4470223", + "metadata": {}, + "source": [ + "# DFAnalyzer Simple Example\n", + "\n", + "This notebook will guide you to load a trace file generated by DFTracer and analyze the trace events using Dask." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "86ed50dc-d4d6-4e78-be69-1d55b8362a46", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "16132659", + "metadata": {}, + "source": [ + "## System imports for the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "432c079e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "import sys\n", + "import yaml\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "9ece9f90", + "metadata": {}, + "source": [ + "## We add the analysis code to path so that we can run this in dev mode." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4a5811b8", + "metadata": {}, + "outputs": [], + "source": [ + "home = os.environ[\"HOME\"]\n", + "\n", + "with open(f\"{home}/.dftracer/configuration.yaml\", \"r\") as file:\n", + " dft_yaml = yaml.safe_load(file)\n", + " app_root = dft_yaml[\"app\"]\n", + "sys.path.insert(0, app_root)" + ] + }, + { + "cell_type": "markdown", + "id": "446ebe05", + "metadata": {}, + "source": [ + "## Imports for the notebook\n", + "\n", + "This may take some time as it initializes Dask." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a47492d-40d0-4dea-b1a2-aa1f083239e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/__init__.py\n" + ] + } + ], + "source": [ + "# Importing DFAnalyzer\n", + "import dfanalyzer\n", + "\n", + "print(dfanalyzer.__file__)\n", + "from dfanalyzer.main import (\n", + " DFAnalyzer,\n", + " update_dft_configuration,\n", + " setup_logging,\n", + " setup_dask_cluster,\n", + " reset_dask_cluster,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0e236854", + "metadata": {}, + "source": [ + "## Initialize DFAnalyzer Configuration\n", + "\n", + "In this function, we can tune DFAnalyzer for the analysis. For example, we can tune number of workers, connect to existing dask cluster, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f1aff029", + "metadata": {}, + "outputs": [], + "source": [ + "def get_conditions_stormer(json_object):\n", + " app_io_cond = (\n", + " \"__getitem__\" in json_object[\"name\"]\n", + " ) # I/O has that application is issuing\n", + " compute_cond = \"compute\" in json_object[\"cat\"]\n", + " io_cond = json_object[\"cat\"] in [\"POSIX\", \"STDIO\"]\n", + " return app_io_cond, compute_cond, io_cond" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cc5aee73", + "metadata": {}, + "outputs": [], + "source": [ + "dask_run_dir = os.path.join(app_root, \"dfanalyzer\", \"dask\", \"run_dir\")\n", + "with open(os.path.join(dask_run_dir, f\"scheduler_{os.getenv('USER')}.json\"), \"r\") as f:\n", + " dask_scheduler = json.load(f)[\"address\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d3a7cf5b-fbbc-4eee-9ed5-ce869f4541e6", + "metadata": {}, + "outputs": [], + "source": [ + "conf = update_dft_configuration(\n", + " dask_scheduler=dask_scheduler,\n", + " verbose=True,\n", + " workers=4,\n", + " time_granularity=80e6,\n", + 
" log_file=f\"./df_{os.getenv('USER')}.log\",\n", + " conditions=get_conditions_stormer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8478828a", + "metadata": {}, + "source": [ + "## This methods sets up logging for DFAnalyzer.\n", + "\n", + "This is needed for debugging and progress tracking. All prints seen in the following cells are configured in this method." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "04428959-820f-4466-8ceb-778bcce93bf0", + "metadata": {}, + "outputs": [], + "source": [ + "setup_logging()" + ] + }, + { + "cell_type": "markdown", + "id": "252225e3", + "metadata": {}, + "source": [ + "## Setup dask cluster.\n", + "\n", + "In this example, we use Dask Local cluster which will use multiprocessing on the same node where the notebook is running to run its workers.\n", + "\n", + "**NOTE:** If your running on Remote VSCode on a cluster, you can tunnel the port and open it locally." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "68e813b7-6b95-4f31-b223-1fb50774db50", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [21:08:40] Initialized Client with 16 workers and link http://140.221.112.12:8787/status [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:665]\n" + ] + } + ], + "source": [ + "setup_dask_cluster()" + ] + }, + { + "cell_type": "markdown", + "id": "237d458d", + "metadata": {}, + "source": [ + "On clicking the link, you will see a daskboard like this. [Dask Daskboard Image](images/dask-dashboard-load.png)" + ] + }, + { + "cell_type": "markdown", + "id": "6207690f", + "metadata": {}, + "source": [ + "## Reset Dask Cluster\n", + "\n", + "In case you have an error and want to clean the cluster for fresh analysis. You can run this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "236e50a4-03b6-4895-a0a5-d473f5e391b5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [21:08:47] Restarting all workers [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:657]\n" + ] + } + ], + "source": [ + "reset_dask_cluster()" + ] + }, + { + "cell_type": "markdown", + "id": "07951b56", + "metadata": {}, + "source": [ + "## Load the DFAnalyzer Trace\n", + "\n", + "The DFAnalyzer class take a regex string as input. For example, \"{app_root}/examples/dfanalyzer/*.pfw.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b37727ee-a221-43ab-81e1-cdf98c2cf314", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [21:08:49] Created index for 16 files [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:366]\n", + "[INFO] [21:08:49] Total size of all files are bytes [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:368]\n", + "[INFO] [21:08:49] Loading 64 batches out of 16 files and has 848888 lines overall [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:381]\n", + "[INFO] [21:08:52] Loaded events [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:423]\n", + "[INFO] [21:08:52] Loaded plots with slope threshold: 45 [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:429]\n" + ] + } + ], + "source": [ + "analyzer = DFAnalyzer(f\"{app_root}/examples/dfanalyzer/test-trace-distributed/*.pfw.gz\")" + ] + }, + { + "cell_type": "markdown", + "id": "23d520b4", + "metadata": {}, + "source": [ + "## Analyze the events\n", + "\n", + "1. The dask dataframe is stored at `analyzer.events`. \n", + "2. We can run dask queries on this dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4c0ffe10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecatpidtidtstedurtintervaltrangehostnamecompute_timeio_timeapp_io_timetotal_timefilenamephasesize
0openPOSIX10757412151482917891868<NA>0.0x3012c0s7b0n0<NA>8<NA>8/dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-...2<NA>
1openPOSIX1075741215148210815108216<NA>0.0x3012c0s7b0n0<NA>6<NA>6/dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-...2<NA>
2__fxstatPOSIX1075741215148210849108501<NA>0.0x3012c0s7b0n0<NA>1<NA>1/dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-...2<NA>
3mmapPOSIX1075741215148210868108768<NA>0.0x3012c0s7b0n0<NA>8<NA>8/dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-...2<NA>
4closePOSIX1075741215148210902109031<NA>0.0x3012c0s7b0n0<NA>1<NA>1/dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-...2<NA>
\n", + "
" + ], + "text/plain": [ + " name cat pid tid ts te dur tinterval trange \\\n", + "0 open POSIX 1075741 2151482 9178 9186 8 0.0 \n", + "1 open POSIX 1075741 2151482 10815 10821 6 0.0 \n", + "2 __fxstat POSIX 1075741 2151482 10849 10850 1 0.0 \n", + "3 mmap POSIX 1075741 2151482 10868 10876 8 0.0 \n", + "4 close POSIX 1075741 2151482 10902 10903 1 0.0 \n", + "\n", + " hostname compute_time io_time app_io_time total_time \\\n", + "0 x3012c0s7b0n0 8 8 \n", + "1 x3012c0s7b0n0 6 6 \n", + "2 x3012c0s7b0n0 1 1 \n", + "3 x3012c0s7b0n0 8 8 \n", + "4 x3012c0s7b0n0 1 1 \n", + "\n", + " filename phase size \n", + "0 /dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-... 2 \n", + "1 /dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-... 2 \n", + "2 /dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-... 2 \n", + "3 /dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-... 2 \n", + "4 /dev/shm/shared_memory.PMI.e6389e0f-f55b-400a-... 2 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzer.events.head()" + ] + }, + { + "cell_type": "markdown", + "id": "006bfab2", + "metadata": {}, + "source": [ + "### Summary \n", + "\n", + "DFAnalyzer supports a summary utility that gives a brief summary of the job and its I/O access behavior." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9350218f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [21:08:55] Total number of events in the workload are 848856 [/eagle/MDClimSim/rayandrew/dftracer/dfanalyzer/main.py:521]\n" + ] + }, + { + "data": { + "text/html": [ + "
╭──────────────────────────────────────────────────── Summary ────────────────────────────────────────────────────╮\n",
+                            "│  Allocation    Scheduler Allocation Details                                                                     │\n",
+                            "│                ├── Nodes: 2                                                                                     │\n",
+                            "│                ├── Processes: 8                                                                                 │\n",
+                            "│                ├── Thread allocations across nodes (includes dynamically created threads)                       │\n",
+                            "│                │   ├── Compute: 8                                                                               │\n",
+                            "│                │   └── I/O: 813                                                                                 │\n",
+                            "│                └── Events Recorded: 849K                                                                        │\n",
+                            "│  Dataset       Description of Dataset Used                                                                      │\n",
+                            "│                └── Files: 13810                                                                                 │\n",
+                            "│  I/O Behavior  Behavior of Application                                                                          │\n",
+                            "│                ├── Split of Time in application                                                                 │\n",
+                            "│                │   ├── Total Time: 381.544 sec                                                                  │\n",
+                            "│                │   ├── Overall App Level I/O: 162.443 sec                                                       │\n",
+                            "│                │   ├── Unoverlapped App I/O: 15.205 sec                                                         │\n",
+                            "│                │   ├── Unoverlapped App Compute: 116.012 sec                                                    │\n",
+                            "│                │   ├── Compute: 263.250 sec                                                                     │\n",
+                            "│                │   ├── Overall I/O: 126.725 sec                                                                 │\n",
+                            "│                │   ├── Unoverlapped I/O: 10.598 sec                                                             │\n",
+                            "│                │   └── Unoverlapped Compute: 147.122 sec                                                        │\n",
+                            "│                └── Metrics by function                                                                          │\n",
+                            "│                    ├── Function       |count |                  size                   |                        │\n",
+                            "│                    ├──                |      |min   |25    |mean  |median|75    |max   |                        │\n",
+                            "│                    ├── open           |10K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── __fxstat       |3K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── mmap           |16    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── close          |10K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── unlink         |11K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── open64         |24    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── __fxstat64     |19K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── lseek64        |25K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── read           |144   |NA    |22    |13KB  |5KB   |36KB  |36KB  |                        │\n",
+                            "│                    ├── opendir        |24    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── __xstat64      |2K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── link           |2K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── pread          |717K  |8     |512   |1MB   |512   |4MB   |4MB   |                        │\n",
+                            "│                    ├── __lxstat       |3K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── write          |6K    |1     |1     |1     |1     |1     |1     |                        │\n",
+                            "│                    ├── mmap64         |6K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── fcntl          |6K    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    └── ftruncate      |12    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+                            "
\n" + ], + "text/plain": [ + "╭──────────────────────────────────────────────────── Summary ────────────────────────────────────────────────────╮\n", + "│ \u001b[36m \u001b[0m\u001b[36mAllocation \u001b[0m\u001b[36m \u001b[0m Scheduler Allocation Details │\n", + "│ \u001b[36m \u001b[0m ├── Nodes: 2 │\n", + "│ \u001b[36m \u001b[0m ├── Processes: 8 │\n", + "│ \u001b[36m \u001b[0m ├── Thread allocations across nodes (includes dynamically created threads) │\n", + "│ \u001b[36m \u001b[0m │ ├── Compute: 8 │\n", + "│ \u001b[36m \u001b[0m │ └── I/O: 813 │\n", + "│ \u001b[36m \u001b[0m └── Events Recorded: 849K │\n", + "│ \u001b[36m \u001b[0m\u001b[36mDataset \u001b[0m\u001b[36m \u001b[0m Description of Dataset Used │\n", + "│ \u001b[36m \u001b[0m └── Files: 13810 │\n", + "│ \u001b[36m \u001b[0m\u001b[36mI/O Behavior\u001b[0m\u001b[36m \u001b[0m Behavior of Application │\n", + "│ \u001b[36m \u001b[0m ├── Split of Time in application │\n", + "│ \u001b[36m \u001b[0m │ ├── Total Time: 381.544 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Overall App Level I/O: 162.443 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Unoverlapped App I/O: 15.205 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Unoverlapped App Compute: 116.012 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Compute: 263.250 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Overall I/O: 126.725 sec │\n", + "│ \u001b[36m \u001b[0m │ ├── Unoverlapped I/O: 10.598 sec │\n", + "│ \u001b[36m \u001b[0m │ └── Unoverlapped Compute: 147.122 sec │\n", + "│ \u001b[36m \u001b[0m └── Metrics by function │\n", + "│ \u001b[36m \u001b[0m ├── Function |count | size | │\n", + "│ \u001b[36m \u001b[0m ├── | |min |25 |mean |median|75 |max | │\n", + "│ \u001b[36m \u001b[0m ├── open |10K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── __fxstat |3K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── mmap |16 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── close |10K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m 
\u001b[0m ├── unlink |11K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── open64 |24 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── __fxstat64 |19K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── lseek64 |25K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── read |144 |NA |22 |13KB |5KB |36KB |36KB | │\n", + "│ \u001b[36m \u001b[0m ├── opendir |24 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── __xstat64 |2K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── link |2K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── pread |717K |8 |512 |1MB |512 |4MB |4MB | │\n", + "│ \u001b[36m \u001b[0m ├── __lxstat |3K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── write |6K |1 |1 |1 |1 |1 |1 | │\n", + "│ \u001b[36m \u001b[0m ├── mmap64 |6K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── fcntl |6K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m └── ftruncate |12 |NA |nan |nan |NA |nan |NA | │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "items = analyzer.summary()\n", + "items" + ] + }, + { + "cell_type": "markdown", + "id": "2ab18972", + "metadata": {}, + "source": [ + "### Timeline plots\n", + "\n", + "We support two timeline plots:\n", + "1. how I/O time and I/O bandwidth changes over time.\n", + "2. how transfer size changes over time." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b9185f98", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax1, ax2 = analyzer.plots.time_bw_timeline(\n", + " bw_unit=\"gb\",\n", + " figsize=(8, 3),\n", + " line1_label=\"POSIX I/O Time\",\n", + " line2_label=\"POSIX I/O Bandwidth\",\n", + " time_col=\"io_time\",\n", + " x_num_ticks=8,\n", + " y_num_ticks=5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "08b58161", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = analyzer.plots.xfer_size_timeline(\n", + " figsize=(8, 3),\n", + " unit=\"mb\",\n", + " x_num_ticks=8,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/dfanalyzer/dlp_analyzer.ipynb b/examples/dfanalyzer/dfanalyzer-ideas.ipynb similarity index 99% rename from examples/dfanalyzer/dlp_analyzer.ipynb rename to examples/dfanalyzer/dfanalyzer-ideas.ipynb index 925b7a3e..0dc731c7 100644 --- a/examples/dfanalyzer/dlp_analyzer.ipynb +++ b/examples/dfanalyzer/dfanalyzer-ideas.ipynb @@ -109,7 +109,7 @@ "source": [ "import dfanalyzer\n", "print(dfanalyzer.__file__)\n", - "from dfanalyzer.main import DFAnalyzer,get_df_configuration,update_df_configuration,setup_logging,setup_dask_cluster, reset_dask_cluster, get_df_configuration" + "from dfanalyzer.main import DFAnalyzer,get_dft_configuration,update_dft_configuration,setup_logging,setup_dask_cluster, reset_dask_cluster, get_dft_configuration" ] }, { @@ -207,7 +207,7 @@ "metadata": {}, "outputs": [], "source": [ - "conf = update_df_configuration(dask_scheduler=dask_scheduler, verbose=True, workers=16,\n", + "conf = update_dft_configuration(dask_scheduler=dask_scheduler, verbose=True, workers=16,\n", " log_file=f\"./df_{os.getenv('USER')}.log\", rebuild_index=False, time_approximate=True, \n", " host_pattern=r'lassen(\\d+)', time_granularity=30e6, skip_hostname=True, 
conditions=condition_fn)\n" ] @@ -232,7 +232,7 @@ } ], "source": [ - "conf = get_df_configuration()\n", + "conf = get_dft_configuration()\n", "conf.time_approximate" ] }, @@ -1740,7 +1740,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -2159,4 +2159,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/examples/dfanalyzer/dfanalyzer.ipynb b/examples/dfanalyzer/dfanalyzer.ipynb new file mode 100644 index 00000000..b521ea4a --- /dev/null +++ b/examples/dfanalyzer/dfanalyzer.ipynb @@ -0,0 +1,628 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b4470223", + "metadata": {}, + "source": [ + "# DFAnalyzer Simple Example\n", + "\n", + "This notebook will guide you to load a trace file generated by DFTracer and analyze the trace events using Dask." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "86ed50dc-d4d6-4e78-be69-1d55b8362a46", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "16132659", + "metadata": {}, + "source": [ + "## System imports for the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "432c079e", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import os\n", + "from pathlib import Path\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "id": "9ece9f90", + "metadata": {}, + "source": [ + "## We add the analysis code to path so that we can run this in dev mode." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4a5811b8", + "metadata": {}, + "outputs": [], + "source": [ + "app_root = str(Path(os.getcwd()).parent.parent)\n", + "sys.path.insert(0, app_root)" + ] + }, + { + "cell_type": "markdown", + "id": "446ebe05", + "metadata": {}, + "source": [ + "## Imports for the notebook\n", + "\n", + "This may take some time as it initializes Dask." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a47492d-40d0-4dea-b1a2-aa1f083239e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/WS2/haridev/dftracer/dfanalyzer/__init__.py\n" + ] + } + ], + "source": [ + "# Importing DFAnalyzer\n", + "import dfanalyzer\n", + "print(dfanalyzer.__file__)\n", + "from dfanalyzer.main import DFAnalyzer, update_dft_configuration, setup_logging, setup_dask_cluster, reset_dask_cluster" + ] + }, + { + "cell_type": "markdown", + "id": "0e236854", + "metadata": {}, + "source": [ + "## Initialize DFAnalyzer Configuration\n", + "\n", + "In this function, we can tune DFAnalyzer for the analysis. For example, we can tune number of workers, connect to existing dask cluster, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d3a7cf5b-fbbc-4eee-9ed5-ce869f4541e6", + "metadata": {}, + "outputs": [], + "source": [ + "conf = update_dft_configuration(verbose=True, workers=4, log_file=f\"./df_{os.getenv('USER')}.log\")" + ] + }, + { + "cell_type": "markdown", + "id": "8478828a", + "metadata": {}, + "source": [ + "## This method sets up logging for DFAnalyzer.\n", + "\n", + "This is needed for debugging and progress tracking. All prints seen in the following cells are configured in this method." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "04428959-820f-4466-8ceb-778bcce93bf0", + "metadata": {}, + "outputs": [], + "source": [ + "setup_logging()" + ] + }, + { + "cell_type": "markdown", + "id": "252225e3", + "metadata": {}, + "source": [ + "## Setup dask cluster.\n", + "\n", + "In this example, we use Dask Local cluster which will use multiprocessing on the same node where the notebook is running to run its workers.\n", + "\n", + "**NOTE:** If you're running on Remote VSCode on a cluster, you can tunnel the port and open it locally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "68e813b7-6b95-4f31-b223-1fb50774db50", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [10:11:57] Initialized Client with 4 workers and link http://127.0.0.1:8787/status [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:669]\n" + ] + } + ], + "source": [ + "setup_dask_cluster()" + ] + }, + { + "cell_type": "markdown", + "id": "237d458d", + "metadata": {}, + "source": [ + "On clicking the link, you will see a dashboard like this. [Dask Dashboard Image](images/dask-dashboard-load.png)" + ] + }, + { + "cell_type": "markdown", + "id": "6207690f", + "metadata": {}, + "source": [ + "## Reset Dask Cluster\n", + "\n", + "In case you have an error and want to clean the cluster for a fresh analysis, you can run this." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "236e50a4-03b6-4895-a0a5-d473f5e391b5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-10 10:11:57,752 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-10 10:11:57,768 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-10 10:11:57,797 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-10 10:11:57,815 - distributed.nanny - WARNING - Restarting worker\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [10:11:59] Restarting all workers [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:657]\n" + ] + } + ], + "source": [ + "reset_dask_cluster()" + ] + }, + { + "cell_type": "markdown", + "id": "07951b56", + "metadata": {}, + "source": [ + "## Load the DFAnalyzer Trace\n", + "\n", + "The DFAnalyzer class takes a regex string as input. 
For example, \"{app_root}/examples/dfanalyzer/*.pfw.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b37727ee-a221-43ab-81e1-cdf98c2cf314", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [10:12:00] Created index for 1 files [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:366]\n", + "[INFO] [10:12:00] Total size of all files are bytes [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:368]\n", + "[INFO] [10:12:00] Loading 6 batches out of 1 files and has 93895 lines overall [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:381]\n", + "[INFO] [10:12:03] Loaded events [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:423]\n", + "[INFO] [10:12:03] Loaded plots with slope threshold: 45 [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:429]\n" + ] + } + ], + "source": [ + "analyzer = DFAnalyzer(f\"{app_root}/examples/dfanalyzer/test-trace.pfw.gz\")" + ] + }, + { + "cell_type": "markdown", + "id": "23d520b4", + "metadata": {}, + "source": [ + "## Analyze the events\n", + "\n", + "1. The dask dataframe is stored at `analyzer.events`. \n", + "2. We can run dask queries on this dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c0ffe10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecatpidtidtstedurtintervaltrangehostnamecompute_timeio_timeapp_io_timetotal_timefilenamephasesize
0TorchFramework.__init__ai_framework0338684620525862052668<NA>6.0ruby165<NA><NA><NA>0<NA>0<NA>
1TorchFramework.is_nativeio_availableai_framework0338684620529362052941<NA>6.0ruby165<NA><NA><NA>0<NA>0<NA>
2FileStorage.__init__storage03386841962053096205290<NA>0.0ruby165<NA><NA><NA>0<NA>0<NA>
3FileStorage.create_namespacestorage03386846205338620536527<NA>6.0ruby165<NA><NA><NA>0<NA>0<NA>
4DLIOBenchmark.__init__dlio_benchmark0338684074845737484573<NA>0.0ruby165<NA><NA><NA>0<NA>0<NA>
\n", + "
" + ], + "text/plain": [ + " name cat pid tid ts \\\n", + "0 TorchFramework.__init__ ai_framework 0 338684 6205258 \n", + "1 TorchFramework.is_nativeio_available ai_framework 0 338684 6205293 \n", + "2 FileStorage.__init__ storage 0 338684 19 \n", + "3 FileStorage.create_namespace storage 0 338684 6205338 \n", + "4 DLIOBenchmark.__init__ dlio_benchmark 0 338684 0 \n", + "\n", + " te dur tinterval trange hostname compute_time io_time \\\n", + "0 6205266 8 6.0 ruby165 \n", + "1 6205294 1 6.0 ruby165 \n", + "2 6205309 6205290 0.0 ruby165 \n", + "3 6205365 27 6.0 ruby165 \n", + "4 7484573 7484573 0.0 ruby165 \n", + "\n", + " app_io_time total_time filename phase size \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzer.events.head()" + ] + }, + { + "cell_type": "markdown", + "id": "006bfab2", + "metadata": {}, + "source": [ + "### Summary \n", + "\n", + "DFAnalyzer supports a summary utility that gives a brief summary of the job and its I/O access behavior." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9350218f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [10:12:07] Total number of events in the workload are 93893 [/usr/WS2/haridev/dftracer/dfanalyzer/main.py:521]\n" + ] + }, + { + "data": { + "text/html": [ + "
╭──────────────────────────────────────────────────── Summary ────────────────────────────────────────────────────╮\n",
+                            "│  Allocation    Scheduler Allocation Details                                                                     │\n",
+                            "│                ├── Nodes: 1                                                                                     │\n",
+                            "│                ├── Processes: 1                                                                                 │\n",
+                            "│                ├── Thread allocations across nodes (includes dynamically created threads)                       │\n",
+                            "│                │   ├── Compute: 0                                                                               │\n",
+                            "│                │   └── I/O: 21                                                                                  │\n",
+                            "│                └── Events Recorded: 94K                                                                         │\n",
+                            "│  Dataset       Description of Dataset Used                                                                      │\n",
+                            "│                └── Files: 103                                                                                   │\n",
+                            "│  I/O Behavior  Behavior of Application                                                                          │\n",
+                            "│                ├── Split of Time in application                                                                 │\n",
+                            "│                │   ├── Total Time: 105.211 sec                                                                  │\n",
+                            "│                │   └── Overall I/O: 54.896 sec                                                                  │\n",
+                            "│                └── Metrics by function                                                                          │\n",
+                            "│                    ├── Function       |count |                  size                   |                        │\n",
+                            "│                    ├──                |      |min   |25    |mean  |median|75    |max   |                        │\n",
+                            "│                    ├── opendir        |42    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── __xstat64      |42    |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── open64         |139   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── __fxstat64     |278   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── lseek64        |82K   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "│                    ├── read           |6K    |NA    |4MB   |4MB   |4MB   |4MB   |4MB   |                        │\n",
+                            "│                    └── close          |119   |NA    |nan   |nan   |NA    |nan   |NA    |                        │\n",
+                            "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+                            "
\n" + ], + "text/plain": [ + "╭──────────────────────────────────────────────────── Summary ────────────────────────────────────────────────────╮\n", + "│ \u001b[36m \u001b[0m\u001b[36mAllocation \u001b[0m\u001b[36m \u001b[0m Scheduler Allocation Details │\n", + "│ \u001b[36m \u001b[0m ├── Nodes: 1 │\n", + "│ \u001b[36m \u001b[0m ├── Processes: 1 │\n", + "│ \u001b[36m \u001b[0m ├── Thread allocations across nodes (includes dynamically created threads) │\n", + "│ \u001b[36m \u001b[0m │ ├── Compute: 0 │\n", + "│ \u001b[36m \u001b[0m │ └── I/O: 21 │\n", + "│ \u001b[36m \u001b[0m └── Events Recorded: 94K │\n", + "│ \u001b[36m \u001b[0m\u001b[36mDataset \u001b[0m\u001b[36m \u001b[0m Description of Dataset Used │\n", + "│ \u001b[36m \u001b[0m └── Files: 103 │\n", + "│ \u001b[36m \u001b[0m\u001b[36mI/O Behavior\u001b[0m\u001b[36m \u001b[0m Behavior of Application │\n", + "│ \u001b[36m \u001b[0m ├── Split of Time in application │\n", + "│ \u001b[36m \u001b[0m │ ├── Total Time: 105.211 sec │\n", + "│ \u001b[36m \u001b[0m │ └── Overall I/O: 54.896 sec │\n", + "│ \u001b[36m \u001b[0m └── Metrics by function │\n", + "│ \u001b[36m \u001b[0m ├── Function |count | size | │\n", + "│ \u001b[36m \u001b[0m ├── | |min |25 |mean |median|75 |max | │\n", + "│ \u001b[36m \u001b[0m ├── opendir |42 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── __xstat64 |42 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── open64 |139 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── __fxstat64 |278 |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── lseek64 |82K |NA |nan |nan |NA |nan |NA | │\n", + "│ \u001b[36m \u001b[0m ├── read |6K |NA |4MB |4MB |4MB |4MB |4MB | │\n", + "│ \u001b[36m \u001b[0m └── close |119 |NA |nan |nan |NA |nan |NA | │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ 
+ "items = analyzer.summary()\n", + "items" + ] + }, + { + "cell_type": "markdown", + "id": "2ab18972", + "metadata": {}, + "source": [ + "### Timeline plots\n", + "\n", + "We support two timeline plots:\n", + "1. how I/O time and I/O bandwidth changes over time.\n", + "2. how transfer size changes over time." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b9185f98", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax1, ax2 = analyzer.plots.time_bw_timeline(\n", + " bw_unit='gb',\n", + " figsize=(8, 3),\n", + " line1_label='POSIX I/O Time',\n", + " line2_label='POSIX I/O Bandwidth',\n", + " time_col='io_time',\n", + " x_num_ticks=8,\n", + " y_num_ticks=5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "08b58161", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = analyzer.plots.xfer_size_timeline(\n", + " figsize=(8, 3),\n", + " unit='mb',\n", + " x_num_ticks=8,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/dfanalyzer/images/bw_timeline.png b/examples/dfanalyzer/images/bw_timeline.png new file mode 100644 index 00000000..ab2c7093 Binary files /dev/null and b/examples/dfanalyzer/images/bw_timeline.png differ diff --git a/examples/dfanalyzer/images/dask-dashboard-load.png b/examples/dfanalyzer/images/dask-dashboard-load.png new file mode 100644 index 00000000..89732dac Binary files /dev/null and b/examples/dfanalyzer/images/dask-dashboard-load.png differ diff --git a/examples/dfanalyzer/images/xfer_timeline.png b/examples/dfanalyzer/images/xfer_timeline.png new file mode 100644 index 00000000..6f93447d Binary files /dev/null and b/examples/dfanalyzer/images/xfer_timeline.png differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-0.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-0.pfw.gz new file mode 100644 index 00000000..7c91a5cb Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-0.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-1.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-1.pfw.gz new file mode 100644 index 00000000..127ba8a4 Binary files 
/dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-1.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-10.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-10.pfw.gz new file mode 100644 index 00000000..82a4f080 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-10.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-11.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-11.pfw.gz new file mode 100644 index 00000000..5fdad7e8 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-11.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-12.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-12.pfw.gz new file mode 100644 index 00000000..59047965 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-12.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-13.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-13.pfw.gz new file mode 100644 index 00000000..a134f477 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-13.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-14.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-14.pfw.gz new file mode 100644 index 00000000..c24ca622 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-14.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-15.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-15.pfw.gz new file mode 100644 index 00000000..aabb1be5 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-15.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-2.pfw.gz 
b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-2.pfw.gz new file mode 100644 index 00000000..16dd6f69 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-2.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-3.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-3.pfw.gz new file mode 100644 index 00000000..3c9f6696 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-3.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-4.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-4.pfw.gz new file mode 100644 index 00000000..b120d7e9 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-4.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz new file mode 100644 index 00000000..59ae000a Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz.zindex b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz.zindex new file mode 100644 index 00000000..b138540d Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-5.pfw.gz.zindex differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-6.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-6.pfw.gz new file mode 100644 index 00000000..fb0ffc4b Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-6.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-7.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-7.pfw.gz new file mode 100644 index 00000000..dbf56892 Binary files /dev/null and 
b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-7.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-8.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-8.pfw.gz new file mode 100644 index 00000000..72e1bcdf Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-8.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace-distributed/test-trace-dist-9.pfw.gz b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-9.pfw.gz new file mode 100644 index 00000000..5ab644d4 Binary files /dev/null and b/examples/dfanalyzer/test-trace-distributed/test-trace-dist-9.pfw.gz differ diff --git a/examples/dfanalyzer/test-trace.pfw.gz b/examples/dfanalyzer/test-trace.pfw.gz new file mode 100644 index 00000000..edbd9a7c Binary files /dev/null and b/examples/dfanalyzer/test-trace.pfw.gz differ diff --git a/examples/graph_visualization/README.md b/examples/graph_visualization/README.md new file mode 100644 index 00000000..5748c38f --- /dev/null +++ b/examples/graph_visualization/README.md @@ -0,0 +1,49 @@ +# Ipycytoscape visualization for trace graph + +## Overview + +This example demonstrates how to represent traces (DFanalyzer events) as graphs and visualize them using ipycytoscape within a Jupyter Notebook environment. + +## Requirements + +To visualize ipycytoscape graphs in DFanalyzer, the following packages are required: + +- **networkx** +- **ipycytoscape** +- **ipywidgets** + +You can install these packages using pip: +```bash + pip install networkx ipycytoscape ipywidgets +``` + +## Graph creation and styling +We use the following methods to convert the traces into a graph and visualize it. + +- **create_nx_graph**: + This method from the *GraphFunctions* class takes a dask dataframe (analyzer.events) and returns a networkx graph. 
In this example, we define nodes as each event in the dataframe, and edges between two + events represent the existence of overlapping time between the events. The definition of nodes/edges may be changed within this method for different use cases. + +- **visualize_graph**: + This method takes a *networkx* graph object and a *CytoGraph* object. Two methods used from *CytoGraph* are + - ***get_json*** is used to convert the nx graph into the json format required for ipycytoscape visualization. + - ***get_style*** is used for styling the cytoscape visualization. We can modify this method to insert different filters (different coloring and layouts) during the visualization. + + + +## Usage + +In this example, we use the following trace events, represented as a graph and visualized using ipycytoscape. Two different colors represent events related to two mount points. + +data + +The trace data is visualized using ipycytoscape, representing each event as a node, with connections indicating overlapping windows. +Nodes with a degree greater than 2 are styled in red; others in purple. 
+ +vis + +## Additional Resource + +- Networkx +- Ipycytoscape + diff --git a/examples/graph_visualization/data.png b/examples/graph_visualization/data.png new file mode 100644 index 00000000..dbf886ca Binary files /dev/null and b/examples/graph_visualization/data.png differ diff --git a/examples/graph_visualization/example.ipynb b/examples/graph_visualization/example.ipynb new file mode 100644 index 00000000..872293b9 --- /dev/null +++ b/examples/graph_visualization/example.ipynb @@ -0,0 +1,643 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#imports \n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/workspace/pandey2/DFtracer/envdft/lib/python3.9/site-packages/dask/dataframe/__init__.py:49: FutureWarning: \n", + "Dask dataframe query planning is disabled because dask-expr is not installed.\n", + "\n", + "You can install it with `pip install dask[dataframe]` or `conda install dask`.\n", + "This will raise in a future version.\n", + "\n", + " warnings.warn(msg, FutureWarning)\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "import yaml\n", + "from pathlib import Path\n", + "from dask.distributed import Client\n", + "import dask.dataframe as dd\n", + "import networkx as nx\n", + "import sys\n", + "import pandas as pd\n", + "from IPython.display import Image\n", + "\n", + "import ipycytoscape\n", + "import ipywidgets as widgets\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/__init__.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [06:59:12] Initialized Client with 4 workers and link 
http://127.0.0.1:42171/status [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:676]\n", + "2024-07-15 06:59:12,321 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-15 06:59:12,344 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-15 06:59:12,347 - distributed.nanny - WARNING - Restarting worker\n", + "2024-07-15 06:59:12,386 - distributed.nanny - WARNING - Restarting worker\n", + "[INFO] [06:59:13] Restarting all workers [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:664]\n" + ] + } + ], + "source": [ + "\n", + "use_local=True\n", + "if not use_local:\n", + " with open(f'/g/g91/pandey2/.dftracer/configuration.yaml', 'r') as file:\n", + " dlp_yaml = yaml.safe_load(file)\n", + " app_root = dlp_yaml[\"app\"]\n", + "else:\n", + " app_root = str(Path(os.getcwd()).parent.parent)\n", + "sys.path.insert(0, app_root)\n", + "\n", + "\n", + "import dfanalyzer\n", + "print(dfanalyzer.__file__)\n", + "from dfanalyzer.main import DFAnalyzer,get_dft_configuration,update_dft_configuration,setup_logging,setup_dask_cluster, reset_dask_cluster, get_dft_configuration\n", + "\n", + "\n", + "if not use_local:\n", + " dask_run_dir = os.path.join(app_root, \"dfanalyzer\", \"dask\", \"run_dir\")\n", + " with open (os.path.join(dask_run_dir, f\"scheduler_{os.getenv('USER')}.json\"), \"r\") as f:\n", + " dask_scheduler = json.load(f)[\"address\"]\n", + "else:\n", + " dask_scheduler = None\n", + "\n", + "\n", + "\n", + "# Configuration 4 update log file dlp -> df\n", + "conf = update_dft_configuration(dask_scheduler=dask_scheduler, verbose=True, \n", + " log_file=f\"./dft_{os.getenv('USER')}.log\", rebuild_index=False, time_approximate=False, \n", + " host_pattern=r'lassen(\\d+)', time_granularity=30e6, skip_hostname=True)\n", + "conf = get_dft_configuration()\n", + "\n", + "\n", + "# Setup\n", + "setup_logging()\n", + "setup_dask_cluster()\n", + "reset_dask_cluster()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, 
+ "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO] [06:59:13] Created index for 0 files [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:370]\n", + "[INFO] [06:59:13] Total size of all files are bytes [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:372]\n", + "[INFO] [06:59:15] Loaded events [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:430]\n", + "[INFO] [06:59:15] Loaded plots with slope threshold: 45 [/usr/WS1/pandey2/DFtracer1/dftracer/dfanalyzer/main.py:436]\n" + ] + } + ], + "source": [ + "filename = \"test_data1.pfw\"\n", + "analyzer = DFAnalyzer(filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecatpidtidtstedurtintervaltrangehostnamecompute_timeio_timeapp_io_timetotal_timefilenamephasesizeid
0readPOSIX01906060[10,70]01d9bd41c01e6<NA>[10,70]<NA>[10,70]<NA>2<NA>0
1readPOSIX019304010[40,50]01d9bd41c01e6<NA>[40,50]<NA>[40,50]<NA>2<NA>1
2readPOSIX01911017060[120,180]01d9bd41c01e6<NA>[120,180]<NA>[120,180]<NA>2<NA>2
3readPOSIX01920024040[210,250]01d9bd41c01e6<NA>[210,250]<NA>[210,250]<NA>2<NA>3
4readPOSIX01922023010[230,240]01d9bd41c01e6<NA>[230,240]<NA>[230,240]<NA>2<NA>4
5readPOSIX01905050[10,60]01d9bd41c01e6<NA>[10,60]<NA>[10,60]/dlio/data12<NA>5
6readPOSIX019308050[40,90]01d9bd41c01e6<NA>[40,90]<NA>[40,90]<NA>2<NA>6
7FreadPOSIX01910018080[110,190]01d9bd41c01e6<NA>[110,190]<NA>[110,190]<NA>2<NA>7
8readPOSIX01913017545[140,185]01d9bd41c01e6<NA>[140,185]<NA>[140,185]/dlio/data1/train/2<NA>8
9readPOSIX01915017020[160,180]01d9bd41c01e6<NA>[160,180]<NA>[160,180]<NA>2<NA>9
10readPOSIX0191791834[189,193]01d9bd41c01e6<NA>[189,193]<NA>[189,193]<NA>2<NA>10
11readPOSIX0192102155[220,225]01d9bd41c01e6<NA>[220,225]<NA>[220,225]/dlio/data1/valid/2<NA>11
\n", + "
" + ], + "text/plain": [ + " name cat pid tid ts te dur tinterval trange hostname \\\n", + "0 read POSIX 0 19 0 60 60 [10,70] 0 1d9bd41c01e6 \n", + "1 read POSIX 0 19 30 40 10 [40,50] 0 1d9bd41c01e6 \n", + "2 read POSIX 0 19 110 170 60 [120,180] 0 1d9bd41c01e6 \n", + "3 read POSIX 0 19 200 240 40 [210,250] 0 1d9bd41c01e6 \n", + "4 read POSIX 0 19 220 230 10 [230,240] 0 1d9bd41c01e6 \n", + "5 read POSIX 0 19 0 50 50 [10,60] 0 1d9bd41c01e6 \n", + "6 read POSIX 0 19 30 80 50 [40,90] 0 1d9bd41c01e6 \n", + "7 Fread POSIX 0 19 100 180 80 [110,190] 0 1d9bd41c01e6 \n", + "8 read POSIX 0 19 130 175 45 [140,185] 0 1d9bd41c01e6 \n", + "9 read POSIX 0 19 150 170 20 [160,180] 0 1d9bd41c01e6 \n", + "10 read POSIX 0 19 179 183 4 [189,193] 0 1d9bd41c01e6 \n", + "11 read POSIX 0 19 210 215 5 [220,225] 0 1d9bd41c01e6 \n", + "\n", + " compute_time io_time app_io_time total_time filename phase \\\n", + "0 [10,70] [10,70] 2 \n", + "1 [40,50] [40,50] 2 \n", + "2 [120,180] [120,180] 2 \n", + "3 [210,250] [210,250] 2 \n", + "4 [230,240] [230,240] 2 \n", + "5 [10,60] [10,60] /dlio/data1 2 \n", + "6 [40,90] [40,90] 2 \n", + "7 [110,190] [110,190] 2 \n", + "8 [140,185] [140,185] /dlio/data1/train/ 2 \n", + "9 [160,180] [160,180] 2 \n", + "10 [189,193] [189,193] 2 \n", + "11 [220,225] [220,225] /dlio/data1/valid/ 2 \n", + "\n", + " size id \n", + "0 0 \n", + "1 1 \n", + "2 2 \n", + "3 3 \n", + "4 4 \n", + "5 5 \n", + "6 6 \n", + "7 7 \n", + "8 8 \n", + "9 9 \n", + "10 10 \n", + "11 11 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzer.events['id'] = analyzer.events.index\n", + "analyzer.events.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/usr/WS1/pandey2/DFtracer1/dftracer'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "app_root" + ] + }, + { + "cell_type": 
"code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8e64b9b680674bd399eb68f945746e57", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "CytoscapeWidget(cytoscape_layout={'name': 'dagre'}, cytoscape_style=[{'selector': 'nodes', 'style': {'font-fam…" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dfanalyzer.graph_visualization.cytoscape import CytoGraph,GraphFunctions\n", + "graphfunction = GraphFunctions()\n", + "cytograph = CytoGraph()\n", + "graph = graphfunction.create_nx_graph(analyzer.events.compute())\n", + "graphfunction.visualize_graph(graph, cytograph)\n", + "# graphfunction.visualize_nxgraph(graph,cyto_obj=cytograph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test data visualization\n", + "Data designed for sanity check of the tool. Two different colors represent read/write request to two different mount points." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "Image(filename =r'data.png')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualization of traces as graph with ipycytoscape\n", + "* Nodes: Events\n", + "\n", + "* Edges: If two events have any overlapping time.\n", + "\n", + "* Degree: Count of overlapping events\n", + "\n", + "* Visualization filters: Nodes/Events with degree > 2 are \"Red\". Events with degree <= 2 are colored purple." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(filename =r'vis.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "envdft", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/graph_visualization/requirement.txt b/examples/graph_visualization/requirement.txt new file mode 100644 index 00000000..85649ea0 --- /dev/null +++ b/examples/graph_visualization/requirement.txt @@ -0,0 +1,3 @@ +ipycytoscape +networkx +ipywidgets \ No newline at end of file diff --git a/examples/graph_visualization/test_data1.pfw b/examples/graph_visualization/test_data1.pfw new file mode 100644 index 00000000..155a5898 --- /dev/null +++ b/examples/graph_visualization/test_data1.pfw @@ -0,0 +1,13 @@ +[ +{"id":"0","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"10","dur":"60","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"1","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"40","dur":"10","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"2","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"120","dur":"60","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"3","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"210","dur":"40","ph":"X","args":{"hostname":"1d9bd41c01e6"}} 
+{"id":"4","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"230","dur":"10","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"5","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"10","dur":"50","ph":"X","args":{"hostname":"1d9bd41c01e6","mode":511,"fname":"/dlio/data1"}} +{"id":"6","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"40","dur":"50","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"7","name":"Fread","cat":"POSIX","pid":"0","tid":"19","ts":"110","dur":"80","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"8","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"140","dur":"45","ph":"X","args":{"hostname":"1d9bd41c01e6","mode":511,"fname":"/dlio/data1/train/"}} +{"id":"9","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"160","dur":"20","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"10","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"189","dur":"4","ph":"X","args":{"hostname":"1d9bd41c01e6"}} +{"id":"11","name":"read","cat":"POSIX","pid":"0","tid":"19","ts":"220","dur":"5","ph":"X","args":{"hostname":"1d9bd41c01e6","mode":511,"fname":"/dlio/data1/valid/"}} diff --git a/examples/graph_visualization/vis.png b/examples/graph_visualization/vis.png new file mode 100644 index 00000000..77fa36e7 Binary files /dev/null and b/examples/graph_visualization/vis.png differ diff --git a/script/dftracer_compact.sh b/script/dftracer_compact.sh new file mode 100755 index 00000000..085a479d --- /dev/null +++ b/script/dftracer_compact.sh @@ -0,0 +1,130 @@ +#!/bin/bash +#!/bin/bash + +# The script compacts all trace file and then divides the trace into equal file pieces. +# This has the following signature. +# +# usage: dftracer_compact [-fcv] [-d input_directory] [-o output_directory] [-l num_lines] [-p prefix] +# -f override output directory +# -c compress output file +# -v enable verbose mode +# -h display help +# -d input_directory specify input directories. should contain .pfw or .pfw.gz files. 
+# -o output_directory specify output directory. +# -l num_lines lines per trace. +# -p prefix prefix to be used for compact files. + + +LOG_DIR=$PWD +OUTPUT_DIR=$PWD/output +LINES=10000 +PREFIX=app +override=0 +compressed=0 + +PPWD=$PWD + + +function usage { + echo "usage: $(basename $0) [-fcv] [-d input_directory] [-o output_directory] [-l num_lines] [-p prefix]" + echo " -f override output directory" + echo " -c compress output file" + echo " -v enable verbose mode" + echo " -h display help" + echo " -d input_directory specify input directories. should contain .pfw or .pfw.gz files." + echo " -o output_directory specify output directory." + echo " -l num_lines lines per trace." + echo " -p prefix prefix to be used for compact files." + exit 1 +} +while getopts ':cvfd:o:l:p:h' opt; do + case "$opt" in + d) + LOG_DIR="${OPTARG}" + ;; + o) + OUTPUT_DIR="${OPTARG}" + ;; + l) + LINES=${OPTARG} + ;; + p) + PREFIX="${OPTARG}" + ;; + f) + override=1 + ;; + v) + set -x + ;; + c) + compressed=1 + ;; + h) + usage + exit 0 + ;; + + :) + echo -e "option requires an argument.\n" + usage + exit 1 + ;; + + ?) + echo -e "Invalid command option.\n" + usage + exit 1 + ;; + esac +done +shift "$(($OPTIND -1))" + +mkdir -p ${OUTPUT_DIR} + +if [ -z "$( ls -A '${OUTPUT_DIR}' )" ] && [ $override -eq 0 ]; then + echo "The directory is not empty. Please pass a clean directory or pass -f flag." + exit 0 +fi + +echo "Setting up output directory" +rm -rf ${OUTPUT_DIR} +mkdir -p ${OUTPUT_DIR} + +pfw_count=`ls -1 $LOG_DIR/*.pfw 2> /dev/null | wc -l` +gz_count=`ls -1 $LOG_DIR/*.gz 2> /dev/null | wc -l` +total=$((pfw_count + gz_count)) +if [ $total == 0 ]; then + echo "The folder does not contain any pfw or pfw.gz files." + exit 0 +fi +dest=${OUTPUT_DIR}/temp +d2=${dest}.bak +shopt -s dotglob +if [[ "$pfw_count" != "0" ]]; then +echo "Parsing pfw files from ${LOG_DIR} folder" +ls ${LOG_DIR}/*.pfw | xargs cat | grep -v "^\[" | jq -c '.' 
> $d2 +fi + +if [[ "$gz_count" != "0" ]]; then +echo "Parsing pfw.gz files from ${LOG_DIR} folder" +gzip -c -d `echo $folder/*.gz` | grep -v "^\[" | jq -c '.' >> $d2 +fi + +cd ${OUTPUT_DIR} + +echo "Compacting all trace files with ${LINES} per files into ${OUTPUT_DIR} folder." +split -l ${LINES} --numeric-suffixes --additional-suffix=.pfw $d2 ${PREFIX}- +for file in *.pfw; do + echo "[" > $file.$$ + cat $file >> $file.$$ + mv $file.$$ $file +done +rm $d2 +if [ $compressed == 1 ]; then +gzip ${PREFIX}-*.pfw +fi + + +cd $PPWD + diff --git a/script/merge_pfw.sh b/script/merge_pfw.sh old mode 100644 new mode 100755 index 05e07780..473ab10e --- a/script/merge_pfw.sh +++ b/script/merge_pfw.sh @@ -1,12 +1,112 @@ #!/bin/bash -folder=$1 -dest=$2 +# This script allows users to combine all pfw format into one. +# This has the following signature. +# +# usage: merge_pfw.sh [-fcv] [-d input_directory] [-o OUTPUT_FILE] +# -f override output file +# -c compress output file +# -v enable verbose mode +# -h display help +# -d input_directory specify input directories. should contain .pfw or .pfw.gz files. +# -o output_file specify output file. should have extension .pfw + + +override=0 +folder=$PWD +compressed=0 +dest="combined.pfw" + +function usage { + echo "usage: $(basename $0) [-fcv] [-d input_directory] [-o OUTPUT_FILE]" + echo " -f override output file" + echo " -c compress output file" + echo " -v enable verbose mode" + echo " -h display help" + echo " -d input_directory specify input directories. should contain .pfw or .pfw.gz files." + echo " -o output_file specify output file. should have extension .pfw" + exit 1 +} +while getopts ':cvfd:o:h' opt; do + case "$opt" in + d) + folder="${OPTARG}" + ;; + + o) + dest="${OPTARG}" + if [[ $dest != *.pfw ]]; then + echo "output_file should have .pfw extension". 
+ fi + ;; + + f) + override=1 + ;; + v) + set -x + ;; + c) + compressed=1 + ;; + h) + usage + exit 0 + ;; + + :) + echo -e "option requires an argument.\n" + usage + exit 1 + ;; + + ?) + echo -e "Invalid command option.\n" + usage + exit 1 + ;; + esac +done +shift "$(($OPTIND -1))" + +if [[ "$override" == "1" ]]; then +rm -rf $dest ${dest}.gz +fi + +pfw_count=`ls -1 $folder/*.pfw 2> /dev/null | wc -l` +gz_count=`ls -1 $folder/*.gz 2> /dev/null | wc -l` +total=$((pfw_count + gz_count)) +if [ $total == 0 ]; then + echo "The folder does not contain any pfw or pfw.gz files." + exit 0 +fi + + +if [ -f $dest ] && [ -f "$dest.gz" ] && [ "$override" -eq "0" ]; then + echo "The destination file exists. Please delete the file." + exit 0 +fi + + + d2=${dest}.bak shopt -s dotglob +if [[ "$pfw_count" != "0" ]]; then +echo "Parsing pfw files from ${folder} folder" cat `echo $folder/*.pfw` >> $d2 -gzip -c -d `echo $folder/*gz` >> $d2 -grep -i "[^#[]" $d2 > $dest +fi + +if [[ "$gz_count" != "0" ]]; then +echo "Parsing pfw.gz files from ${folder} folder" +gzip -c -d `echo $folder/*.gz` >> $d2 +fi + +echo "Extracting events" +grep -i "[^#[]" $d2 | jq -c > $dest printf '%s\n%s\n' "[" "$(cat ${dest})" > $dest +if [ $compressed == 1 ]; then +echo "Compressing events" gzip $dest +fi +echo "Created output file ${dest}.gz" rm $d2 \ No newline at end of file diff --git a/setup.py b/setup.py index 59168a32..40eaa709 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,9 @@ def build_extension(self, ext: CMakeExtension) -> None: enable_dlio_tests = os.environ.get("DFTRACER_ENABLE_PAPER_TESTS", "OFF") cmake_args += [f"-DDFTRACER_ENABLE_PAPER_TESTS={enable_dlio_tests}"] + test_ld_library_path = os.environ.get("DFTRACER_TEST_LD_LIBRARY_PATH", "") + cmake_args += [f"-DDFTRACER_TEST_LD_LIBRARY_PATH={test_ld_library_path}"] + # CMake lets you override the generator - we need to check this. # Can be set with Conda-Build, for example. 
cmake_generator = os.environ.get("CMAKE_GENERATOR", "") @@ -151,7 +154,7 @@ def build_extension(self, ext: CMakeExtension) -> None: # logic and declaration, and simpler if you include description/version in a file. setup( name="pydftracer", - version="1.0.2", + version="1.0.3", description="I/O profiler for deep learning python apps. Specifically for dlio_benchmark.", long_description=long_description, long_description_content_type="text/markdown", @@ -192,9 +195,10 @@ def build_extension(self, ext: CMakeExtension) -> None: zip_safe=False, extras_require={"test": ["pytest>=6.0"], "dfanalyzer": [ + "seaborn>=0.13.2", "bokeh>=2.4.2", "pybind11", - "zindex_py==0.0.1", + "zindex_py==0.0.2", "pandas>=2.0.3", "dask>=2023.5.0", "distributed", diff --git a/src/dftracer/brahma/posix.cpp b/src/dftracer/brahma/posix.cpp index 391fc29c..9088eec2 100644 --- a/src/dftracer/brahma/posix.cpp +++ b/src/dftracer/brahma/posix.cpp @@ -195,6 +195,7 @@ int brahma::POSIXDFTracer::openat(int dirfd, const char *pathname, int flags, DFT_LOGGER_START(dirfd); DFT_LOGGER_UPDATE(dirfd); DFT_LOGGER_UPDATE(flags); + DFT_LOGGER_UPDATE(pathname); int ret = -1; if (flags & O_CREAT) { va_list args; diff --git a/src/dftracer/brahma/posix.h b/src/dftracer/brahma/posix.h index 2ebd054d..b4c9a259 100644 --- a/src/dftracer/brahma/posix.h +++ b/src/dftracer/brahma/posix.h @@ -79,7 +79,7 @@ class POSIXDFTracer : public POSIX { DFTRACER_LOGDEBUG("Finalizing POSIXDFTracer", ""); stop_trace = true; } - ~POSIXDFTracer() { DFTRACER_LOGDEBUG("Destructing POSIXDFTracer", ""); } + ~POSIXDFTracer() {} static std::shared_ptr get_instance(bool trace_all = false) { DFTRACER_LOGDEBUG("POSIX class get_instance", ""); if (!stop_trace && instance == nullptr) { diff --git a/src/dftracer/brahma/stdio.cpp b/src/dftracer/brahma/stdio.cpp index 164d472c..75966645 100644 --- a/src/dftracer/brahma/stdio.cpp +++ b/src/dftracer/brahma/stdio.cpp @@ -47,6 +47,7 @@ size_t brahma::STDIODFTracer::fread(void *ptr, size_t size, size_t 
nmemb, DFT_LOGGER_UPDATE(size); DFT_LOGGER_UPDATE(nmemb); size_t ret = __real_fread(ptr, size, nmemb, fp); + DFT_LOGGER_UPDATE(ret); DFT_LOGGER_END(); return ret; } @@ -58,6 +59,7 @@ size_t brahma::STDIODFTracer::fwrite(const void *ptr, size_t size, size_t nmemb, DFT_LOGGER_UPDATE(size); DFT_LOGGER_UPDATE(nmemb); size_t ret = __real_fwrite(ptr, size, nmemb, fp); + DFT_LOGGER_UPDATE(ret); DFT_LOGGER_END(); return ret; } @@ -66,6 +68,7 @@ long brahma::STDIODFTracer::ftell(FILE *fp) { BRAHMA_MAP_OR_FAIL(ftell); DFT_LOGGER_START(fp); long ret = __real_ftell(fp); + DFT_LOGGER_UPDATE(ret); DFT_LOGGER_END(); return ret; } @@ -76,6 +79,7 @@ int brahma::STDIODFTracer::fseek(FILE *fp, long offset, int whence) { DFT_LOGGER_UPDATE(offset); DFT_LOGGER_UPDATE(whence); int ret = __real_fseek(fp, offset, whence); + DFT_LOGGER_UPDATE(ret); DFT_LOGGER_END(); return ret; } \ No newline at end of file diff --git a/src/dftracer/brahma/stdio.h b/src/dftracer/brahma/stdio.h index 63dcb24d..17135b5e 100644 --- a/src/dftracer/brahma/stdio.h +++ b/src/dftracer/brahma/stdio.h @@ -66,7 +66,7 @@ class STDIODFTracer : public STDIO { DFTRACER_LOGDEBUG("Finalizing STDIODFTracer", ""); stop_trace = true; } - ~STDIODFTracer() { DFTRACER_LOGDEBUG("Destructing STDIODFTracer", ""); }; + ~STDIODFTracer() {}; static std::shared_ptr get_instance(bool trace_all = false) { DFTRACER_LOGDEBUG("STDIO class get_instance", ""); diff --git a/src/dftracer/core/dftracer_main.cpp b/src/dftracer/core/dftracer_main.cpp index a85ac9a4..2b5cf20c 100644 --- a/src/dftracer/core/dftracer_main.cpp +++ b/src/dftracer/core/dftracer_main.cpp @@ -137,34 +137,43 @@ void dftracer::DFTracerCore::initialize(bool _bind, const char *_log_file, this->process_id = *_process_id; } DFTRACER_LOGDEBUG("Setting process_id to %d", this->process_id); - if (_log_file == nullptr) { - char cmd[128]; - sprintf(cmd, "/proc/%lu/cmdline", df_getpid()); - int fd = df_open(cmd, O_RDONLY); - std::string exec_name = "DEFAULT"; - - if (fd != -1) { - 
char exec_file_name[DFT_PATH_MAX]; - ssize_t read_bytes = df_read(fd, exec_file_name, DFT_PATH_MAX); - df_close(fd); - ssize_t index = 0; - while (index < read_bytes - 1) { - if (exec_file_name[index] == '\0') { - exec_file_name[index] = SEPARATOR; + char exec_name[DFT_PATH_MAX] = "DEFAULT"; + char exec_cmd[DFT_PATH_MAX] = "DEFAULT"; + char cmd[128]; + sprintf(cmd, "/proc/%lu/cmdline", df_getpid()); + int fd = df_open(cmd, O_RDONLY); + if (fd != -1) { + ssize_t read_bytes = df_read(fd, exec_cmd, DFT_PATH_MAX); + df_close(fd); + ssize_t index = 0; + size_t parts = 0; + size_t last_index = 0; + bool has_extracted = false; + while (index < read_bytes - 1 && index < DFT_PATH_MAX - 2) { + if (exec_cmd[index] == '\0') { + if (!has_extracted) { + strcpy(exec_name, basename(exec_cmd + last_index)); + if (strcmp(exec_name, "python") != 0) { + has_extracted = true; + } + DFTRACER_LOGINFO("Extracted process_name %s", exec_name); } - index++; + exec_cmd[index] = SEPARATOR; + last_index = index + 1; + parts++; } - DFTRACER_LOGDEBUG("Exec command line %s", exec_file_name); - auto items = split(exec_file_name, SEPARATOR); - for (const auto &item : items) { - if (strstr(item.c_str(), "python") == nullptr) { - exec_name = basename(item.c_str()); - break; - } + if (parts > 1) { + exec_cmd[index] = '\0'; } + index++; } - DFTRACER_LOGINFO("Extracted process_name %s", exec_name.c_str()); + exec_cmd[DFT_PATH_MAX - 1] = '\0'; + DFTRACER_LOGDEBUG("Exec command line %s", exec_cmd); + } + if (_log_file == nullptr) { + DFTRACER_LOGINFO("Extracted process_name %s", exec_name); if (!conf->log_file.empty()) { + DFTRACER_LOGDEBUG("Conf has log file %s", conf->log_file.c_str()); this->log_file = std::string(conf->log_file) + "-" + exec_name + "-" + std::to_string(this->process_id) + "-" + log_file_suffix + ".pfw"; @@ -176,11 +185,12 @@ void dftracer::DFTracerCore::initialize(bool _bind, const char *_log_file, this->log_file = _log_file; } DFTRACER_LOGDEBUG("Setting log file to %s", 
this->log_file.c_str()); - logger->update_log_file(this->log_file, this->process_id); + logger->update_log_file(this->log_file, exec_name, exec_cmd, + this->process_id); if (bind) { if (conf->io) { auto trie = dftracer::Singleton::get_instance(); - const char *ignore_extensions[3] = {".pfw", ".py",".pfw.gz"}; + const char *ignore_extensions[3] = {".pfw", ".py", ".pfw.gz"}; const char *ignore_prefix[8] = {"/pipe", "/socket", "/proc", "/sys", "/collab", "anon_inode", "socket", "/var/tmp"}; diff --git a/src/dftracer/df_logger.h b/src/dftracer/df_logger.h index 352ef3b8..86baed87 100644 --- a/src/dftracer/df_logger.h +++ b/src/dftracer/df_logger.h @@ -10,13 +10,16 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include #include typedef std::chrono::high_resolution_clock chrono; @@ -54,12 +57,26 @@ class DFTLogger { index_stack.clear(); DFTRACER_LOGDEBUG("Destructing DFTLogger", ""); } - inline void update_log_file(std::string log_file, ProcessID process_id = -1) { + inline void update_log_file(std::string log_file, std::string exec_name, + std::string cmd, ProcessID process_id = -1) { DFTRACER_LOGDEBUG("DFTLogger.update_log_file %s", log_file.c_str()); this->process_id = process_id; this->writer = dftracer::Singleton::get_instance(); if (this->writer != nullptr) { this->writer->initialize(log_file.data(), this->throw_error); + auto meta = std::unordered_map(); + meta.insert_or_assign("version", DFTRACER_VERSION); + meta.insert_or_assign("exec", exec_name); + meta.insert_or_assign("cmd", cmd); + time_t ltime; /* calendar time */ + ltime = time(NULL); /* get current cal time */ + char timestamp[1024]; + auto size = sprintf(timestamp, "%s", asctime(localtime(&ltime))); + timestamp[size - 1] = '\0'; + meta.insert_or_assign("date", std::string(timestamp)); + this->enter_event(); + this->log("start", "dftracer", this->get_time(), 0, &meta); + this->exit_event(); } this->is_init = true; DFTRACER_LOGINFO("Writing trace to 
%s", log_file.c_str()); @@ -112,6 +129,11 @@ class DFTLogger { inline void finalize() { DFTRACER_LOGDEBUG("DFTLogger.finalize", ""); if (this->writer != nullptr) { + auto meta = std::unordered_map(); + meta.insert_or_assign("num_events", index.load()); + this->enter_event(); + this->log("end", "dftracer", this->get_time(), 0, &meta); + this->exit_event(); writer->finalize(has_entry); DFTRACER_LOGINFO("Released Logger", ""); } else { diff --git a/src/dftracer/utils/utils.h b/src/dftracer/utils/utils.h index 2945183d..6b721fab 100644 --- a/src/dftracer/utils/utils.h +++ b/src/dftracer/utils/utils.h @@ -40,7 +40,9 @@ inline void signal_handler(int sig) { // GCOVR_EXCL_START nptrs = backtrace(buffer, STACK_SIZE); strings = backtrace_symbols(buffer, nptrs); if (strings != NULL) { - for (j = 0; j < nptrs; j++) printf("%s\n", strings[j]); + for (j = 0; j < nptrs; j++) { + DFTRACER_LOGERROR("%s", strings[j]); + } free(strings); } exit(0); diff --git a/src/dftracer/writer/chrome_writer.cpp b/src/dftracer/writer/chrome_writer.cpp index c5a8f4d2..d381d865 100644 --- a/src/dftracer/writer/chrome_writer.cpp +++ b/src/dftracer/writer/chrome_writer.cpp @@ -8,8 +8,10 @@ #include #include +#include #include #include +#include #include #include @@ -41,6 +43,7 @@ void dftracer::ChromeWriter::log( std::unordered_map *metadata, ProcessID process_id, ThreadID thread_id) { DFTRACER_LOGDEBUG("ChromeWriter.log", ""); + if (fh != nullptr) { int size; char data[MAX_LINE_SIZE]; diff --git a/src/dftracer/writer/chrome_writer.h b/src/dftracer/writer/chrome_writer.h index 7bfeb6c0..208fb4cf 100644 --- a/src/dftracer/writer/chrome_writer.h +++ b/src/dftracer/writer/chrome_writer.h @@ -62,9 +62,10 @@ class ChromeWriter { auto written_elements = fwrite(write_buffer, sizeof(char), write_size, fh); funlockfile(fh); if (written_elements != write_size) { // GCOVR_EXCL_START - ERROR(written_elements != write_size, - "unable to log write %s for a+ written only %d of %d with error %s", - filename.c_str(), 
written_elements, write_size, strerror(errno)); + ERROR( + written_elements != write_size, + "unable to log write for a+ written only %d of %d with error code %d", + written_elements, write_size, errno); } // GCOVR_EXCL_STOP return written_elements; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c41d3a9c..5f4d0a61 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,7 +22,7 @@ add_dependencies(test_c ${PROJECT_NAME}_preload) function(set_common_properties test_name) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_LOG_LEVEL=DEBUG) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_TRACE_COMPRESSION=0) - set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/${DFTRACER_LIBDIR}) + set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/${DFTRACER_LIBDIR}:${DFTRACER_TEST_LD_LIBRARY_PATH}) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_DATA_DIR=${CMAKE_CURRENT_BINARY_DIR}/data) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_LOG_FILE=${CMAKE_CURRENT_BINARY_DIR}/${test_name}) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_ENABLE=1) @@ -125,7 +125,7 @@ set(test_name test_py_both) df_add_test(${test_name} ${DFTRACER_PYTHON_EXE} ${CMAKE_CURRENT_SOURCE_DIR}/py/test.py --format=npz --data_dir=${CMAKE_CURRENT_BINARY_DIR}/data) set_common_properties(${test_name}) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT PYTHONPATH=$ENV{PYTHONPATH}:${CMAKE_SOURCE_DIR}/venv/${DFTRACER_LIBDIR}) -set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/${DFTRACER_LIBDIR}:${CMAKE_SOURCE_DIR}/dependency/.spack-env/view/lib64) +set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/${DFTRACER_LIBDIR}:${CMAKE_SOURCE_DIR}/dependency/.spack-env/view/lib64:${DFTRACER_TEST_LD_LIBRARY_PATH}) set_property(TEST 
${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_LOG_FILE=${CMAKE_CURRENT_BINARY_DIR}/${test_name}_app) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT LD_PRELOAD=${CMAKE_BINARY_DIR}/${DFTRACER_LIBDIR}/libdftracer_preload.so) set_property(TEST ${test_name} APPEND PROPERTY ENVIRONMENT DFTRACER_DATA_DIR=${CMAKE_BINARY_DIR}) diff --git a/test/paper/load_darshan.py b/test/paper/load_darshan.py index ac4e7da5..098b04c3 100644 --- a/test/paper/load_darshan.py +++ b/test/paper/load_darshan.py @@ -64,6 +64,13 @@ def get_dict(row): args = parser.parse_args() filename = args.trace_file +cluster = LocalCluster(n_workers=args.workers) # Launches a scheduler and workers locally +client = Client(cluster) # Connect to distributed cluster and override default + +args = parser.parse_args() +filename = args.trace_file + + file_pattern = glob(filename) all_records = [] diff --git a/test/py/test.py b/test/py/test.py index 4e237a31..3932b05c 100644 --- a/test/py/test.py +++ b/test/py/test.py @@ -141,6 +141,11 @@ def init(): """This function is called when new processes start.""" print(f"Initializing process {os.getpid()}") +@dft_fn.log +def with_default_args(step=2): + for i in dft_fn.iter(range(step)): + print(i) + def main(): posix_calls((20, False)) @@ -174,6 +179,8 @@ def main(): for n in range(args.niter): read_data(num_files=args.num_files, data_dir=args.data_dir, format=args.format) + with_default_args() + log_inst.finalize()