diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 3482cc463b..e928fafd7f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -72,10 +72,11 @@ jobs: pr_info: ${{ steps.get-pr-info.outcome == 'success' && steps.get-pr-info.outputs.pr-info || '' }} checks: + needs: [prepare] + # Only run the CI pipeline if the PR does not have the skip-ci label and we are on a PR branch + if: ${{ !fromJSON(needs.prepare.outputs.has_skip_ci_label) && fromJSON(needs.prepare.outputs.is_pr )}} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 - # Only run the CI pipeline if the PR does not have the skip-ci label - if: ${{ ! fromJSON(needs.prepare.outputs.has_skip_ci_label) }} with: enable_check_generated_files: false diff --git a/CHANGELOG.md b/CHANGELOG.md index a289af5f0a..d00283cc18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,93 @@ See the License for the specific language governing permissions and limitations under the License. 
--> +# Morpheus 24.03.00 (7 Apr 2024) + +## 🚨 Breaking Changes + +- Updating `nlohman_json` to 3.11 to match MRC ([#1596](https://github.com/nv-morpheus/Morpheus/pull/1596)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Add retry logic and proxy support to the NeMo LLM Service ([#1544](https://github.com/nv-morpheus/Morpheus/pull/1544)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Upgrade `openai` version to 1.13 and `langchain` to version 0.1.9 ([#1529](https://github.com/nv-morpheus/Morpheus/pull/1529)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Make `start_async()` available to source stages ([#1523](https://github.com/nv-morpheus/Morpheus/pull/1523)) [@efajardo-nv](https://github.com/efajardo-nv) +- RAPIDS 24.02 Upgrade ([#1468](https://github.com/nv-morpheus/Morpheus/pull/1468)) [@cwharris](https://github.com/cwharris) +- Decouple TritonInferenceStage from pipeline mode ([#1402](https://github.com/nv-morpheus/Morpheus/pull/1402)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 🐛 Bug Fixes + +- Serialize datetime objects into the module config ([#1592](https://github.com/nv-morpheus/Morpheus/pull/1592)) [@dagardner-nv](https://github.com/dagardner-nv) +- Remove the defaults channel from `dependencies.yml` ([#1584](https://github.com/nv-morpheus/Morpheus/pull/1584)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Fix `iso_date_regex_pattern` config in `file_batcher` module and allow override ([#1580](https://github.com/nv-morpheus/Morpheus/pull/1580)) [@efajardo-nv](https://github.com/efajardo-nv) +- Update DFP MLflow ModelManager to handle model retrieval using file URI ([#1578](https://github.com/nv-morpheus/Morpheus/pull/1578)) [@efajardo-nv](https://github.com/efajardo-nv) +- Fix `configure_logging` in DFP benchmarks ([#1553](https://github.com/nv-morpheus/Morpheus/pull/1553)) [@efajardo-nv](https://github.com/efajardo-nv) +- Catch langchain agent errors ([#1539](https://github.com/nv-morpheus/Morpheus/pull/1539)) 
[@dagardner-nv](https://github.com/dagardner-nv) +- Adding missing dependency on `pydantic` ([#1535](https://github.com/nv-morpheus/Morpheus/pull/1535)) [@yuchenz427](https://github.com/yuchenz427) +- Fix memory leak in the mutable dataframe checkout/checkin code ([#1534](https://github.com/nv-morpheus/Morpheus/pull/1534)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix pathlib.Path support for FileSourceStage ([#1531](https://github.com/nv-morpheus/Morpheus/pull/1531)) [@dagardner-nv](https://github.com/dagardner-nv) +- Make `start_async()` available to source stages ([#1523](https://github.com/nv-morpheus/Morpheus/pull/1523)) [@efajardo-nv](https://github.com/efajardo-nv) +- Update CI Containers ([#1521](https://github.com/nv-morpheus/Morpheus/pull/1521)) [@cwharris](https://github.com/cwharris) +- Fix intermittent segfault on interpreter shutdown ([#1513](https://github.com/nv-morpheus/Morpheus/pull/1513)) [@dagardner-nv](https://github.com/dagardner-nv) +- Adopt updated builds of CI runners ([#1503](https://github.com/nv-morpheus/Morpheus/pull/1503)) [@dagardner-nv](https://github.com/dagardner-nv) +- Update mlflow plugin version for deployments fix ([#1499](https://github.com/nv-morpheus/Morpheus/pull/1499)) [@pdmack](https://github.com/pdmack) +- Add runtime environment output to fix building the release container ([#1496](https://github.com/nv-morpheus/Morpheus/pull/1496)) [@cwharris](https://github.com/cwharris) +- Fix logging of sleep time ([#1493](https://github.com/nv-morpheus/Morpheus/pull/1493)) [@dagardner-nv](https://github.com/dagardner-nv) +- Pin pytest to <8 ([#1485](https://github.com/nv-morpheus/Morpheus/pull/1485)) [@dagardner-nv](https://github.com/dagardner-nv) +- Improve pipeline stop logic to ensure join is called exactly once for all stages ([#1479](https://github.com/nv-morpheus/Morpheus/pull/1479)) [@efajardo-nv](https://github.com/efajardo-nv) +- Fix expected JSON config file extension in logger 
([#1471](https://github.com/nv-morpheus/Morpheus/pull/1471)) [@efajardo-nv](https://github.com/efajardo-nv) +- Fix Loss Function to Improve Model Convergence for `AutoEncoder` ([#1460](https://github.com/nv-morpheus/Morpheus/pull/1460)) [@hsin-c](https://github.com/hsin-c) +- GNN fraud detection notebook fix ([#1450](https://github.com/nv-morpheus/Morpheus/pull/1450)) [@efajardo-nv](https://github.com/efajardo-nv) +- Eliminate Redundant Fetches in RSS Controller ([#1442](https://github.com/nv-morpheus/Morpheus/pull/1442)) [@bsuryadevara](https://github.com/bsuryadevara) +- Updating the workspace settings to remove deprecated python options ([#1440](https://github.com/nv-morpheus/Morpheus/pull/1440)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Improve camouflage startup issues ([#1436](https://github.com/nv-morpheus/Morpheus/pull/1436)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fixes to modular DFP examples and benchmarks ([#1429](https://github.com/nv-morpheus/Morpheus/pull/1429)) [@efajardo-nv](https://github.com/efajardo-nv) + +## 📖 Documentation + +- Update minimum compute requirements to Volta ([#1594](https://github.com/nv-morpheus/Morpheus/pull/1594)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix broken link in getting started with Morpheus doc ([#1494](https://github.com/nv-morpheus/Morpheus/pull/1494)) [@edknv](https://github.com/edknv) +- Update abp-model-card.md ([#1439](https://github.com/nv-morpheus/Morpheus/pull/1439)) [@drobison00](https://github.com/drobison00) +- Update gnn-fsi-model-card.md ([#1438](https://github.com/nv-morpheus/Morpheus/pull/1438)) [@drobison00](https://github.com/drobison00) +- Update phishing-model-card.md ([#1437](https://github.com/nv-morpheus/Morpheus/pull/1437)) [@drobison00](https://github.com/drobison00) +- Document incompatible mlflow models issue ([#1434](https://github.com/nv-morpheus/Morpheus/pull/1434)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 🚀 New Features + +- Adding retry 
logic to the `TritonInferenceStage` to allow recovering from errors ([#1548](https://github.com/nv-morpheus/Morpheus/pull/1548)) [@cwharris](https://github.com/cwharris) +- Create a base mixin class for ingress & egress stages ([#1473](https://github.com/nv-morpheus/Morpheus/pull/1473)) [@dagardner-nv](https://github.com/dagardner-nv) +- RAPIDS 24.02 Upgrade ([#1468](https://github.com/nv-morpheus/Morpheus/pull/1468)) [@cwharris](https://github.com/cwharris) +- Install headers & morpheus-config.cmake ([#1448](https://github.com/nv-morpheus/Morpheus/pull/1448)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 🛠️ Improvements + +- Updating `nlohman_json` to 3.11 to match MRC ([#1596](https://github.com/nv-morpheus/Morpheus/pull/1596)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- DOCA 2.6 from public repo ([#1588](https://github.com/nv-morpheus/Morpheus/pull/1588)) [@e-ago](https://github.com/e-ago) +- Support `ControlMessage` for `PreProcessNLPStage` `PreProcessFILStage` `AddScoreStageBase` ([#1573](https://github.com/nv-morpheus/Morpheus/pull/1573)) [@yuchenz427](https://github.com/yuchenz427) +- Update MLflow in Production DFP example to use Python 3.10 ([#1572](https://github.com/nv-morpheus/Morpheus/pull/1572)) [@efajardo-nv](https://github.com/efajardo-nv) +- Fix environment yaml paths ([#1551](https://github.com/nv-morpheus/Morpheus/pull/1551)) [@efajardo-nv](https://github.com/efajardo-nv) +- Add retry logic and proxy support to the NeMo LLM Service ([#1544](https://github.com/nv-morpheus/Morpheus/pull/1544)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Update to match new MRC function sig for AsyncioRunnable::on_data ([#1541](https://github.com/nv-morpheus/Morpheus/pull/1541)) [@dagardner-nv](https://github.com/dagardner-nv) +- Expose max_retries parameter to OpenAIChatService & OpenAIChatClient ([#1536](https://github.com/nv-morpheus/Morpheus/pull/1536)) [@dagardner-nv](https://github.com/dagardner-nv) +- Upgrade `openai` version to 1.13 and 
`langchain` to version 0.1.9 ([#1529](https://github.com/nv-morpheus/Morpheus/pull/1529)) [@mdemoret-nv](https://github.com/mdemoret-nv) +- Update ops-bot.yaml ([#1528](https://github.com/nv-morpheus/Morpheus/pull/1528)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add the ability to attach Tensor objects and timestamps to `ControlMessage` ([#1511](https://github.com/nv-morpheus/Morpheus/pull/1511)) [@drobison00](https://github.com/drobison00) +- Fix or silence warnings emitted during tests ([#1501](https://github.com/nv-morpheus/Morpheus/pull/1501)) [@dagardner-nv](https://github.com/dagardner-nv) +- Support ControlMessage output in the C++ impl of DeserializeStage ([#1478](https://github.com/nv-morpheus/Morpheus/pull/1478)) [@dagardner-nv](https://github.com/dagardner-nv) +- DOCA Source Stage improvements ([#1475](https://github.com/nv-morpheus/Morpheus/pull/1475)) [@e-ago](https://github.com/e-ago) +- Update copyright headers for 2024 ([#1474](https://github.com/nv-morpheus/Morpheus/pull/1474)) [@efajardo-nv](https://github.com/efajardo-nv) +- Add conda builds to CI ([#1466](https://github.com/nv-morpheus/Morpheus/pull/1466)) [@dagardner-nv](https://github.com/dagardner-nv) +- Grafana log monitoring and error alerting example ([#1463](https://github.com/nv-morpheus/Morpheus/pull/1463)) [@efajardo-nv](https://github.com/efajardo-nv) +- Misc Conda Improvements ([#1462](https://github.com/nv-morpheus/Morpheus/pull/1462)) [@dagardner-nv](https://github.com/dagardner-nv) +- Simplification of the streaming RAG ingest example to improve usability ([#1454](https://github.com/nv-morpheus/Morpheus/pull/1454)) [@drobison00](https://github.com/drobison00) +- Replace GPUtil with pynvml for benchmark reports ([#1451](https://github.com/nv-morpheus/Morpheus/pull/1451)) [@efajardo-nv](https://github.com/efajardo-nv) +- Misc test improvements ([#1447](https://github.com/nv-morpheus/Morpheus/pull/1447)) [@dagardner-nv](https://github.com/dagardner-nv) +- Add a --manual_seed flag 
to the CLI ([#1445](https://github.com/nv-morpheus/Morpheus/pull/1445)) [@dagardner-nv](https://github.com/dagardner-nv) +- Optionally skip ci based on a label in the pr ([#1444](https://github.com/nv-morpheus/Morpheus/pull/1444)) [@dagardner-nv](https://github.com/dagardner-nv) +- Refactor verification of optional dependencies ([#1443](https://github.com/nv-morpheus/Morpheus/pull/1443)) [@dagardner-nv](https://github.com/dagardner-nv) +- Use dependencies.yaml as source-of-truth for environment files. ([#1441](https://github.com/nv-morpheus/Morpheus/pull/1441)) [@cwharris](https://github.com/cwharris) +- Add mocked test & benchmark for LLM agents pipeline ([#1424](https://github.com/nv-morpheus/Morpheus/pull/1424)) [@dagardner-nv](https://github.com/dagardner-nv) +- Add benchmarks for stand-alone RAG & vdb upload pipelines ([#1421](https://github.com/nv-morpheus/Morpheus/pull/1421)) [@dagardner-nv](https://github.com/dagardner-nv) +- Add benchmark for completion pipeline ([#1414](https://github.com/nv-morpheus/Morpheus/pull/1414)) [@dagardner-nv](https://github.com/dagardner-nv) +- Decouple TritonInferenceStage from pipeline mode ([#1402](https://github.com/nv-morpheus/Morpheus/pull/1402)) [@dagardner-nv](https://github.com/dagardner-nv) + # Morpheus 23.11.01 (7 Dec 2023) ## 🐛 Bug Fixes diff --git a/ci/iwyu/mappings.imp b/ci/iwyu/mappings.imp index a8d955dbe9..a087b65fbe 100644 --- a/ci/iwyu/mappings.imp +++ b/ci/iwyu/mappings.imp @@ -52,41 +52,10 @@ # rxcpp # Hide includes that are exported by -{ "include": [ "\"rx-includes.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-util.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-predef.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-subscription.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ 
"include": [ "", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-observer.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-scheduler.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-subscriber.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-notification.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-coordination.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-sources.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-subjects.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-operators.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-observable.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-connectable_observable.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, -{ "include": [ "\"rxcpp/rx-grouped_observable.hpp\"", private, "", "public" ] }, -{ "include": [ "", private, "", "public" ] }, +{ "include": ["@", "private", "", "public" ] }, +{ "include": ["@\"rxcpp/.*\"", "private", "", "public" ] }, +{ "include": ["@", "private", "", "public" ] }, +{ "include": ["@\"rxcpp/rx-.*\"", "private", "", "public" ] }, #Triton Client { "include": ["\"common.h\"", "private", "", "public"] }, diff --git a/cmake/package_config/bsd/Configure_bsd.cmake b/cmake/package_config/bsd/Configure_bsd.cmake index 
e7af920dd9..1a68e006b7 100644 --- a/cmake/package_config/bsd/Configure_bsd.cmake +++ b/cmake/package_config/bsd/Configure_bsd.cmake @@ -32,7 +32,7 @@ function(morpheus_configure_libbsd) if (bsd_ADDED) message(STATUS "libbsd was not installed and will be built from source") - find_package(bsd REQUIRED) + find_package(md REQUIRED) set(bsd_INSTALL_DIR ${bsd_BINARY_DIR}/install) file(MAKE_DIRECTORY ${bsd_INSTALL_DIR}/include) @@ -46,8 +46,6 @@ function(morpheus_configure_libbsd) cmake_path(GET MD_LIBRARY PARENT_PATH MD_LINK_DIRECTORY) - message(STATUS "MD_LIBRARY: ${MD_LINK_DIRECTORY}") - # Get the Compiler settings to forward onto autoconf set(COMPILER_SETTINGS "CXX=${CMAKE_CXX_COMPILER_LAUNCHER} ${CMAKE_CXX_COMPILER}" @@ -87,7 +85,7 @@ function(morpheus_configure_libbsd) # Install only the headers install( - DIRECTORY ${md_INSTALL_DIR}/include + DIRECTORY ${bsd_INSTALL_DIR}/include TYPE INCLUDE ) @@ -108,8 +106,6 @@ function(morpheus_configure_libbsd) add_dependencies(bsd::bsd bsd) - message(STATUS "bsd_INSTALL_DIR: ${bsd_INSTALL_DIR}") - endif() LIST(POP_BACK CMAKE_MESSAGE_CONTEXT) diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml index 191929a961..f75b2bbecd 100644 --- a/conda/environments/all_cuda-121_arch-x86_64.yaml +++ b/conda/environments/all_cuda-121_arch-x86_64.yaml @@ -117,6 +117,7 @@ dependencies: - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - PyMuPDF==1.23.21 + - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 - dglgo diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml index 2ef115c7a7..3c37f5af7f 100644 --- a/conda/environments/dev_cuda-121_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-121_arch-x86_64.yaml @@ -10,6 +10,7 @@ channels: - pytorch dependencies: - appdirs +- automake - benchmark=1.8.3 - boost-cpp=1.84 - breathe=4.35.0 @@ -43,6 +44,7 @@ 
dependencies: - ipython - isort - librdkafka>=1.9.2,<1.10.0a0 +- libtool - mlflow=2.9.2 - mrc=24.06 - myst-parser=0.18.1 @@ -72,6 +74,7 @@ dependencies: - pytorch-cuda - pytorch=*=*cuda* - rapidjson=1.1.0 +- rdma-core>=48 - requests - requests-cache=1.1 - requests-toolbelt @@ -92,6 +95,7 @@ dependencies: - zlib=1.2.13 - pip: - PyMuPDF==1.23.21 + - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 - pymilvus==2.3.6 diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml index 2b663d6b1a..e2e32c67a1 100644 --- a/conda/environments/examples_cuda-121_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-121_arch-x86_64.yaml @@ -61,6 +61,7 @@ dependencies: - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - PyMuPDF==1.23.21 + - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 - dglgo diff --git a/conda/environments/runtime_cuda-121_arch-x86_64.yaml b/conda/environments/runtime_cuda-121_arch-x86_64.yaml index 791f58e463..2c5a21bdf3 100644 --- a/conda/environments/runtime_cuda-121_arch-x86_64.yaml +++ b/conda/environments/runtime_cuda-121_arch-x86_64.yaml @@ -37,6 +37,7 @@ dependencies: - watchdog=3.0 - websockets - pip: + - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 - pymilvus==2.3.6 diff --git a/dependencies.yaml b/dependencies.yaml index 770581e141..516e918910 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -52,6 +52,7 @@ files: - cudatoolkit - data_retrieval - development + - doca - docs - python - runtime @@ -270,6 +271,7 @@ dependencies: - websockets - pip - pip: + - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 # update to match pymilvus when available - pymilvus==2.3.6 diff --git a/docker/Dockerfile b/docker/Dockerfile index ff2caaf455..492e4becae 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -149,9 +149,6 @@ FROM conda_env as base_extended # Add one or more 
optional dependencies to the base environment ARG MORPHEUS_ROOT_HOST ARG MORPHEUS_SUPPORT_DOCA="FALSE" -ARG DOCA_REPO_HOST -ARG DOCA_VERSION=2.6.0-0.0.1 - # Set this environment variable so it auto builds DOCA ENV MORPHEUS_SUPPORT_DOCA=${MORPHEUS_SUPPORT_DOCA} diff --git a/docker/build_container.sh b/docker/build_container.sh index f908a1b2db..36c2f7084d 100755 --- a/docker/build_container.sh +++ b/docker/build_container.sh @@ -31,7 +31,6 @@ DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} CUDA_MAJOR_VER=${CUDA_MAJOR_VER:-12} CUDA_MINOR_VER=${CUDA_MINOR_VER:-1} CUDA_REV_VER=${CUDA_REV_VER:-1} -DOCA_REPO_HOST=${DOCA_REPO_HOST:-""} FROM_IMAGE=${FROM_IMAGE:-"nvidia/cuda"} LINUX_DISTRO=${LINUX_DISTRO:-ubuntu} LINUX_VER=${LINUX_VER:-22.04} @@ -47,7 +46,6 @@ DOCKER_ARGS="${DOCKER_ARGS} --target ${DOCKER_TARGET}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg CUDA_MAJOR_VER=${CUDA_MAJOR_VER}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg CUDA_MINOR_VER=${CUDA_MINOR_VER}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg CUDA_REV_VER=${CUDA_REV_VER}" -DOCKER_ARGS="${DOCKER_ARGS} --build-arg DOCA_REPO_HOST=${DOCA_REPO_HOST}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg FROM_IMAGE=${FROM_IMAGE}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg LINUX_DISTRO=${LINUX_DISTRO}" DOCKER_ARGS="${DOCKER_ARGS} --build-arg LINUX_VER=${LINUX_VER}" @@ -66,7 +64,6 @@ echo "Building morpheus:${DOCKER_TAG} with args..." 
echo " CUDA_MAJOR_VER : ${CUDA_MAJOR_VER}" echo " CUDA_MINOR_VER : ${CUDA_MINOR_VER}" echo " CUDA_REV_VER : ${CUDA_REV_VER}" -echo " DOCA_REPO_HOST : ${DOCA_REPO_HOST}" echo " FROM_IMAGE : ${FROM_IMAGE}" echo " LINUX_DISTRO : ${LINUX_DISTRO}" echo " LINUX_VER : ${LINUX_VER}" diff --git a/docker/optional_deps/doca.sh b/docker/optional_deps/doca.sh index 8351b79db4..97d1d108f3 100755 --- a/docker/optional_deps/doca.sh +++ b/docker/optional_deps/doca.sh @@ -17,14 +17,15 @@ set -e MORPHEUS_SUPPORT_DOCA=${MORPHEUS_SUPPORT_DOCA:-OFF} +LINUX_DISTRO=${LINUX_DISTRO:-ubuntu} +LINUX_VER=${LINUX_VER:-22.04} +DOCA_VERSION=${DOCA_VERSION:-2.6.0} # Exit early if nothing to do if [[ ${MORPHEUS_SUPPORT_DOCA} != @(TRUE|ON) ]]; then exit 0 fi -DOCA_REPO_HOST=${DOCA_REPO_HOST:?"Must set \$DOCA_REPO_HOST to build DOCA."} -DOCA_VERSION=${DOCA_VERSION:-2.6.0-0.0.1} WORKING_DIR=$1 echo "Installing DOCA using directory: ${WORKING_DIR}" @@ -33,15 +34,64 @@ DEB_DIR=${WORKING_DIR}/deb mkdir -p ${DEB_DIR} -# Download all files with -nc to skip download if its already there -wget -nc -P ${DEB_DIR} https://${DOCA_REPO_HOST}/doca-repo-2.6.0/doca-repo-2.6.0-0.0.1-240205-083002-daily/doca-host-repo-ubuntu2204_2.6.0-0.0.1-240205-083002-daily.2.6.0058.1.24.01.0.3.3.1_amd64.deb -# Install the doca host repo -dpkg -i ${DEB_DIR}/doca-host-repo*.deb +DOCA_REPO_LINK="https://linux.mellanox.com/public/repo/doca/${DOCA_VERSION}" +DOCA_REPO="${DOCA_REPO_LINK}/ubuntu22.04" +DOCA_REPO_ARCH="x86_64" +DOCA_UPSTREAM_REPO="${DOCA_REPO}/${DOCA_REPO_ARCH}" -# Install all other packages -apt-get update -# apt-get install -y libjson-c-dev meson cmake pkg-config -apt-get install -y doca-sdk doca-runtime doca-gpu doca-gpu-dev +# Upgrade the base packages (diff between image and Canonical upstream repo) +apt update -y +apt upgrade -y + +# Cleanup apt +rm -rf /var/lib/apt/lists/* +apt autoremove -y + +# Configure DOCA Repository, and install packages +apt update -y + +# Install wget & Add the DOCA public repository +apt 
install -y --no-install-recommends wget software-properties-common gpg-agent +wget -qO - ${DOCA_UPSTREAM_REPO}/GPG-KEY-Mellanox.pub | apt-key add - +add-apt-repository "deb [trusted=yes] ${DOCA_UPSTREAM_REPO} ./" +apt update -y + +# Install base-rt content +apt install -y --no-install-recommends \ + doca-gpu \ + doca-gpu-dev \ + doca-prime-runtime \ + doca-prime-sdk \ + doca-sdk \ + dpcp \ + flexio \ + ibacm \ + ibverbs-utils \ + librdmacm1 \ + libibnetdisc5 \ + libibumad3 \ + libibmad5 \ + libopensm \ + libopenvswitch \ + libyara8 \ + mlnx-tools \ + ofed-scripts \ + openmpi \ + openvswitch-common \ + openvswitch-switch \ + srptools \ + mlnx-ethtool \ + mlnx-iproute2 \ + python3-pyverbs \ + rdma-core \ + ucx \ + yara + + # Cleanup apt +rm -rf /usr/lib/python3/dist-packages +apt remove -y software-properties-common gpg-agent +rm -rf /var/lib/apt/lists/* +apt autoremove -y # Now install the gdrcopy library according to: https://github.com/NVIDIA/gdrcopy GDRCOPY_DIR=${WORKING_DIR}/gdrcopy diff --git a/docker/run_container_release.sh b/docker/run_container_release.sh index 1086d5eb39..7a60d75faf 100755 --- a/docker/run_container_release.sh +++ b/docker/run_container_release.sh @@ -27,8 +27,12 @@ x="\033[0m" # Change to the script file to ensure we are in the correct repo (in case were in a submodule) pushd ${SCRIPT_DIR} &> /dev/null +MORPHEUS_SUPPORT_DOCA=${MORPHEUS_SUPPORT_DOCA:-OFF} + DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"nvcr.io/nvidia/morpheus/morpheus"} DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"$(git describe --tags --abbrev=0)-runtime"} + +# This variable is used for passing extra arguments to the docker run command. Do not use DOCKER_ARGS for this purpose. 
DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} popd &> /dev/null @@ -40,6 +44,18 @@ if [[ -n "${SSH_AUTH_SOCK}" ]]; then DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" fi +# DPDK requires hugepage and privileged container +DOCA_EXTRA_ARGS="" +if [[ ${MORPHEUS_SUPPORT_DOCA} == @(TRUE|ON) ]]; then + echo -e "${b}Enabling DOCA Support. Mounting /dev/hugepages and running in privileged mode${x}" + + DOCKER_ARGS="${DOCKER_ARGS} -v /dev/hugepages:/dev/hugepages --privileged" +fi + + echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" -docker run --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" +# Enable command logging to show what is being executed +set -x +docker run ${DOCA_EXTRA_ARGS} --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" +set +x diff --git a/docs/source/cloud_deployment_guide.md b/docs/source/cloud_deployment_guide.md index ddf5c63142..4825ef2412 100644 --- a/docs/source/cloud_deployment_guide.md +++ b/docs/source/cloud_deployment_guide.md @@ -725,7 +725,7 @@ On your AWS EC2 G4 instance, follow the instructions in the linked document to i ### Prerequisites 1. NVIDIA-Certified System -2. NVIDIA Pascal GPU or newer (Compute Capability >= 6.0) +2. NVIDIA Volta GPU or newer (Compute Capability >= 7.0) 3. Ubuntu 20.04 LTS or newer ## Installing Cloud Native Core Stack on NVIDIA Certified Systems diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md index 7064574855..66ad68fcb6 100644 --- a/docs/source/developer_guide/contributing.md +++ b/docs/source/developer_guide/contributing.md @@ -68,7 +68,7 @@ All of the following instructions assume several variables have been set: - `PYTHON_VER`: The desired Python version. Minimum required is `3.10` - `RAPIDS_VER`: The desired RAPIDS version for all RAPIDS libraries including cuDF and RMM. 
If in doubt use `23.06` - `TRITONCLIENT_VERSION`: The desired Triton client. If in doubt use `22.10` - - `CUDA_VER`: The desired CUDA version to use. If in doubt use `11.8` + - `CUDA_VER`: The desired CUDA version to use. If in doubt use `12.1` ### Clone the repository and pull large file data from Git LFS @@ -77,7 +77,7 @@ All of the following instructions assume several variables have been set: export PYTHON_VER=3.10 export RAPIDS_VER=23.06 export TRITONCLIENT_VERSION=22.10 -export CUDA_VER=11.8 +export CUDA_VER=12.1 export MORPHEUS_ROOT=$(pwd)/morpheus git clone https://github.com/nv-morpheus/Morpheus.git $MORPHEUS_ROOT cd $MORPHEUS_ROOT @@ -173,9 +173,8 @@ Note: These instructions assume the user is using `mamba` instead of `conda` sin #### Prerequisites -- Pascal architecture GPU or better -- NVIDIA driver `520.61.05` or higher -- [CUDA 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) +- Volta architecture GPU or better +- [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive) - `conda` and `mamba` - Refer to the [Getting Started Guide](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) if `conda` is not already installed - Install `mamba`: @@ -191,7 +190,7 @@ Note: These instructions assume the user is using `mamba` instead of `conda` sin ```bash export PYTHON_VER=3.10 export RAPIDS_VER=23.06 - export CUDA_VER=11.8 + export CUDA_VER=12.1 export MORPHEUS_ROOT=$(pwd)/morpheus git clone https://github.com/nv-morpheus/Morpheus.git $MORPHEUS_ROOT cd $MORPHEUS_ROOT diff --git a/docs/source/getting_started.md b/docs/source/getting_started.md index e49a2ba736..5ed797ab48 100644 --- a/docs/source/getting_started.md +++ b/docs/source/getting_started.md @@ -27,8 +27,8 @@ The [pre-built Docker containers](#using-pre-built-docker-containers) are the ea More advanced users, or those who are interested in using the latest pre-release features, will need to [build the Morpheus container](#building-the-morpheus-container) 
or [build from source](./developer_guide/contributing.md#building-from-source). ## Requirements -- Pascal architecture GPU or better -- NVIDIA driver `520.61.05` or higher +- Volta architecture GPU or better +- [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive) - [Docker](https://docs.docker.com/get-docker/) - [The NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker) - [NVIDIA Triton Inference Server](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) `23.06` or higher diff --git a/examples/abp_nvsmi_detection/nvsmi_data_extract.py b/examples/abp_nvsmi_detection/nvsmi_data_extract.py index 088f0be983..2d99c43efa 100644 --- a/examples/abp_nvsmi_detection/nvsmi_data_extract.py +++ b/examples/abp_nvsmi_detection/nvsmi_data_extract.py @@ -21,7 +21,7 @@ from pynvml.smi import nvidia_smi -def main(): +def main(args): query_opts = NVSMI_QUERY_GPU.copy() # Remove the timestamp and supported clocks from the query @@ -68,6 +68,5 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--interval-ms', default=1000, help='interval in ms between writes to output file') parser.add_argument("--output-file", default='nvsmi.jsonlines', help='output file to save dataset') - args = parser.parse_args() - main() + main(parser.parse_args()) diff --git a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.cpp b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.cpp index a639bc1a35..3d3c824870 100644 --- a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.cpp +++ b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.cpp @@ -17,10 +17,13 @@ #include "pass_thru.hpp" +#include +#include #include #include // for pymrc::import #include +#include namespace morpheus_example { diff --git 
a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.hpp b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.hpp index 9670aab1d7..a4be293fcb 100644 --- a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.hpp +++ b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/_lib/pass_thru.hpp @@ -21,9 +21,14 @@ #include // for Segment Builder #include // for Segment Object #include // for PythonNode +#include #include #include +#include + +// IWYU pragma: no_include "morpheus/objects/data_table.hpp" +// IWYU pragma: no_include namespace morpheus_example { diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.cpp b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.cpp index bf7427b773..a8a7d8e0cb 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.cpp +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.cpp @@ -17,16 +17,24 @@ #include "rabbitmq_source.hpp" +#include +#include +#include #include -#include #include -#include // for timedelta->chrono conversions +#include +#include +#include // IWYU pragma: keep #include +#include #include #include #include // for std::this_thread::sleep_for -#include +#include + +// IWYU pragma: no_include +// IWYU pragma: no_include namespace morpheus_rabbit { diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.hpp b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.hpp index 8a1b2ff462..c117bcc5e9 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.hpp +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/_lib/rabbitmq_source.hpp @@ -17,16 +17,22 @@ #pragma once -#include // for 
AmqpClient::Channel::ptr_t -#include // for cudf::io::table_with_metadata -#include // for MessageMeta -#include // for Segment Builder -#include // for Segment Object -#include // for mrc::pymrc::PythonSource +#include +#include // for cudf::io::table_with_metadata +#include // for MessageMeta +#include // for Segment Builder +#include // for Segment Object +#include // for mrc::pymrc::PythonSource +#include #include // for chrono::milliseconds #include // for shared_ptr #include +#include +#include + +// IWYU pragma: no_include "morpheus/objects/data_table.hpp" +// IWYU pragma: no_include namespace morpheus_rabbit { diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py index d047901ff1..74befdbb51 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py @@ -38,6 +38,8 @@ def __init__(self, config: Config, dfp_arg_parser: DFPArgParser, schema: Schema, self._source_schema_str = pyobj2str(schema.source, encoding=encoding) self._preprocess_schema_str = pyobj2str(schema.preprocess, encoding=encoding) self._input_message_type = pyobj2str(MultiMessage, encoding) + self._start_time_str = self._dfp_arg_parser.time_fields.start_time.isoformat() + self._end_time_str = self._dfp_arg_parser.time_fields.end_time.isoformat() def get_module_conf(self): module_conf = {} @@ -58,8 +60,8 @@ def infer_module_conf(self): "cache_dir": self._dfp_arg_parser.cache_dir, "batching_options": { "sampling_rate_s": self._dfp_arg_parser.sample_rate_s, - "start_time": self._dfp_arg_parser.time_fields.start_time, - "end_time": self._dfp_arg_parser.time_fields.end_time, + "start_time": self._start_time_str, + "end_time": self._end_time_str, "iso_date_regex_pattern": iso_date_regex_pattern, "parser_kwargs": { "lines": False, "orient": "records" @@ 
-112,8 +114,8 @@ def train_module_conf(self): "cache_dir": self._dfp_arg_parser.cache_dir, "batching_options": { "sampling_rate_s": self._dfp_arg_parser.sample_rate_s, - "start_time": self._dfp_arg_parser.time_fields.start_time, - "end_time": self._dfp_arg_parser.time_fields.end_time, + "start_time": self._start_time_str, + "end_time": self._end_time_str, "iso_date_regex_pattern": iso_date_regex_pattern, "parser_kwargs": { "lines": False, "orient": "records" diff --git a/examples/doca/run.py b/examples/doca/run.py index 3f27a453bc..a2a4415f04 100644 --- a/examples/doca/run.py +++ b/examples/doca/run.py @@ -112,7 +112,9 @@ def run_pipeline(pipeline_batch_size, # add doca source stage pipeline.set_source(DocaSourceStage(config, nic_addr, gpu_addr, traffic_type)) - pipeline.add_stage(MonitorStage(config, description="DOCA GPUNetIO rate", unit='pkts')) + + if traffic_type == 'udp': + pipeline.add_stage(MonitorStage(config, description="DOCA GPUNetIO rate", unit='pkts')) if traffic_type == 'tcp': # add deserialize stage diff --git a/examples/gnn_fraud_detection_pipeline/README.md b/examples/gnn_fraud_detection_pipeline/README.md index 715aaf4b80..c61f288499 100644 --- a/examples/gnn_fraud_detection_pipeline/README.md +++ b/examples/gnn_fraud_detection_pipeline/README.md @@ -21,7 +21,7 @@ limitations under the License. Prior to running the GNN fraud detection pipeline, additional requirements must be installed in to your Conda environment. A supplemental requirements file has been provided in this example directory. 
```bash -export CUDA_VER=11.8 +export CUDA_VER=12.1 mamba env update \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml diff --git a/examples/llm/vdb_upload/module/schema_transform.py b/examples/llm/vdb_upload/module/schema_transform.py index e4ddd57699..43e86c3dd4 100644 --- a/examples/llm/vdb_upload/module/schema_transform.py +++ b/examples/llm/vdb_upload/module/schema_transform.py @@ -96,12 +96,12 @@ def _schema_transform(builder: mrc.Builder): raise - schema_config = validated_config.schema_transform_config + schema_config: dict[str, dict[str, Any]] = validated_config.schema_transform_config source_column_info = [] preserve_columns = [] - for col_name, col_config in schema_config.items(): + for col_name, col_config in schema_config.items(): # pylint: disable=no-member op_type = col_config.get("op_type") if (op_type == "rename"): # Handling renamed columns diff --git a/models/training-tuning-scripts/abp-models/abp_nvsmi_xgb_training.py b/models/training-tuning-scripts/abp-models/abp_nvsmi_xgb_training.py index 3d2142e3ad..0f670461a2 100644 --- a/models/training-tuning-scripts/abp-models/abp_nvsmi_xgb_training.py +++ b/models/training-tuning-scripts/abp-models/abp_nvsmi_xgb_training.py @@ -100,10 +100,10 @@ def save_model(model): # print("Validation_score: ", acc) -def main(): +def main(trainingdata): print('Preprocessing...') (x_train, x_test, y_train, y_test) = \ - train_val_split(preprocess(args.trainingdata)) + train_val_split(preprocess(trainingdata)) print('Model Training...') model = train(x_train, x_test, y_train, y_test) print('Saving Model') @@ -118,4 +118,4 @@ def main(): parser.add_argument('--trainingdata', required=True, help='Labelled data in JSON format') args = parser.parse_args() - main() + main(args.trainingdata) diff --git a/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py b/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py index 3bb266c564..7a32ac5156 100644 --- 
a/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py +++ b/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py @@ -30,7 +30,7 @@ from morpheus.utils.seed import manual_seed -def main(): +def main(args): x_train = pd.read_csv(args.trainingdata) x_val = pd.read_csv(args.valdata) @@ -114,6 +114,5 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--trainingdata", required=True, help="CloudTrail CSV") parser.add_argument("--valdata", required=True, help="CloudTrail CSV") - args = parser.parse_args() - main() + main(parser.parse_args()) diff --git a/models/training-tuning-scripts/root-cause-models/root_cause_bert.py b/models/training-tuning-scripts/root-cause-models/root_cause_bert.py index 850941ae98..caa92384c4 100644 --- a/models/training-tuning-scripts/root-cause-models/root_cause_bert.py +++ b/models/training-tuning-scripts/root-cause-models/root_cause_bert.py @@ -85,7 +85,7 @@ def train(trainingdata, unseenerrors): print(f1_score(true_labels, tests)) -def main(): +def main(args): train(args.trainingdata, args.unseenerrors) @@ -97,6 +97,5 @@ def main(): required=True, help="""Labelled data to be added to test set for evaluation after training""") - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git a/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py b/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py index ff5cba8a21..e41f3a4405 100644 --- a/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py +++ b/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py @@ -205,7 +205,7 @@ def model_eval(model, val_dataloader, idx2label): print(cfm) -def main(): +def main(args): print("Data Preprocessing...") train_dataloader, val_dataloader, idx2label = data_preprocessing(args.training_data) print("Model Training...") @@ -227,6 +227,5 @@ def main(): with model file") 
parser.add_argument("--tokenizer-hash-filepath", required=True, help="hash file for tokenizer vocab") parser.add_argument("--output-file", required=True, help="output file to save new model") - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git a/models/validation-inference-scripts/abp-models/abp_inference.py b/models/validation-inference-scripts/abp-models/abp_inference.py index 572bee8eee..3ee354a621 100644 --- a/models/validation-inference-scripts/abp-models/abp_inference.py +++ b/models/validation-inference-scripts/abp-models/abp_inference.py @@ -54,7 +54,7 @@ def infer(validationdata, model, output): df2.to_json(output, orient='records', lines=True) -def main(): +def main(args): infer(args.validationdata, args.model, args.output) @@ -64,6 +64,5 @@ def main(): parser.add_argument("--validationdata", required=True, help="Labelled data in JSON format") parser.add_argument("--model", required=True, help="trained model") parser.add_argument("--output", required=True, help="output filename") - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git a/models/validation-inference-scripts/dfp-models/hammah_inference.py b/models/validation-inference-scripts/dfp-models/hammah_inference.py index 2fa712994d..e928a6332c 100644 --- a/models/validation-inference-scripts/dfp-models/hammah_inference.py +++ b/models/validation-inference-scripts/dfp-models/hammah_inference.py @@ -119,7 +119,7 @@ def back_to_string(obj): x_validation.to_csv(output, index=False) -def main(): +def main(args): infer(args.validationdata, args.model, args.output) @@ -129,6 +129,5 @@ def main(): parser.add_argument("--validationdata", required=True, help="Labelled data in JSON format") parser.add_argument("--model", required=True, help="trained model") parser.add_argument("--output", required=True, help="output filename") - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git 
a/models/validation-inference-scripts/phishing-models/phish_bert_inference_script.py b/models/validation-inference-scripts/phishing-models/phish_bert_inference_script.py index 6189afb641..5327788e0d 100644 --- a/models/validation-inference-scripts/phishing-models/phish_bert_inference_script.py +++ b/models/validation-inference-scripts/phishing-models/phish_bert_inference_script.py @@ -87,7 +87,7 @@ def bert_uncased_tokenize(strings, max_seq_len): df.to_json(output, orient='records', lines=True) -def main(): +def main(args): infer(args.validationdata, args.vocab, args.model, args.output) @@ -98,6 +98,5 @@ def main(): parser.add_argument("--vocab", required=True, help="BERT voabulary file") parser.add_argument("--model", required=True, help="pretrained model") parser.add_argument("--output", required=True, help="output filename") - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git a/models/validation-inference-scripts/root-cause-models/root_cause_inference.py b/models/validation-inference-scripts/root-cause-models/root_cause_inference.py index fa0542b112..18a6609389 100644 --- a/models/validation-inference-scripts/root-cause-models/root_cause_inference.py +++ b/models/validation-inference-scripts/root-cause-models/root_cause_inference.py @@ -99,7 +99,7 @@ def bert_uncased_tokenize(strings, max_seq_len): df.to_json(output, orient='records', lines=True) -def main(): +def main(args): infer(args.validationdata, args.vocab, args.model, args.output) @@ -110,6 +110,5 @@ def main(): parser.add_argument('--vocab', required=True, help='BERT voabulary file') parser.add_argument('--model', required=True, help='pretrained model') parser.add_argument('--output', required=True, help='output filename') - args = parser.parse_args() -main() + main(parser.parse_args()) diff --git a/morpheus.code-workspace b/morpheus.code-workspace index 49801e0482..cbeadce076 100644 --- a/morpheus.code-workspace +++ b/morpheus.code-workspace @@ -12,8 +12,7 @@ "ms-python.isort", 
"ms-python.pylint", "ms-vscode.cmake-tools", - "stkb.rewrap", - "twxs.cmake" + "stkb.rewrap" ], // List of extensions recommended by VS Code that should not be recommended for users of this workspace. "unwantedRecommendations": [ @@ -697,6 +696,38 @@ "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, "rewrap.wrappingColumn": 120, + "testMate.cpp.test.advancedExecutables": [ + { + "pattern": "{build,Build,BUILD,out,Out,OUT}/**/*{test,Test,TEST}_*.x", + "env": { + "UCX_ERROR_SIGNALS": "", // Prevent UCX from capturing errors + "MORPHEUS_ROOT": "${workspaceFolder}", // Ensure the MORPHEUS_ROOT is set + } + } + ], + "testMate.cpp.debug.configTemplate": { + "args": "${argsArray}", + "cwd": "${cwd}", + "darwin": { + "MIMode": "lldb", + "type": "cppdbg" + }, + "env": "${envObj}", + "environment": "${envObjArray}", + "name": "Debug Test Runner (${parentLabel} > ${label})", + "program": "${exec}", + "request": "launch", + "testMate.cpp.debug.setEnv": { + "GLOG_v": "10", + "GTEST_CATCH_EXCEPTIONS": "0", // Allow the debugger to catch exceptions + "MORPHEUS_ROOT": "${workspaceFolder}", // Ensure the MORPHEUS_ROOT is set + "UCX_ERROR_SIGNALS": "" // Prevent UCX from capturing errors + }, + "type": "lldb", + "win32": { + "type": "cppvsdbg" + } + }, "yapf.args": [ "--style=${workspaceFolder}/setup.cfg" ] diff --git a/morpheus/_lib/doca/CMakeLists.txt b/morpheus/_lib/doca/CMakeLists.txt index 2578ca02a3..9cdf7a7e44 100644 --- a/morpheus/_lib/doca/CMakeLists.txt +++ b/morpheus/_lib/doca/CMakeLists.txt @@ -50,7 +50,7 @@ target_link_libraries(morpheus_doca file(GLOB_RECURSE morpheus_doca_public_headers LIST_DIRECTORIES FALSE CONFIGURE_DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/include/morpheus/doca/*" ) # Add headers to target sources file_set so they can be installed @@ -59,6 +59,7 @@ target_sources(morpheus_doca PUBLIC FILE_SET public_headers TYPE HEADERS + BASE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" FILES 
${morpheus_doca_public_headers} ) @@ -76,17 +77,23 @@ set_target_properties(morpheus_doca CUDA_SEPARABLE_COMPILATION ON ) +if (MORPHEUS_PYTHON_INPLACE_BUILD) + morpheus_utils_inplace_build_copy(morpheus_doca ${CMAKE_CURRENT_SOURCE_DIR}) +endif() + +# ################################################################################################## +# - install targets -------------------------------------------------------------------------------- + +# Get the library directory in a cross-platform way +rapids_cmake_install_lib_dir(lib_dir) + install( TARGETS morpheus_doca EXPORT ${PROJECT_NAME}-core-exports + LIBRARY + DESTINATION ${lib_dir} FILE_SET public_headers - COMPONENT - Wheel ) - -if (MORPHEUS_PYTHON_INPLACE_BUILD) - morpheus_utils_inplace_build_copy(morpheus_doca ${CMAKE_CURRENT_SOURCE_DIR}) -endif() diff --git a/morpheus/_lib/doca/include/common.hpp b/morpheus/_lib/doca/include/morpheus/doca/common.hpp similarity index 100% rename from morpheus/_lib/doca/include/common.hpp rename to morpheus/_lib/doca/include/morpheus/doca/common.hpp diff --git a/morpheus/_lib/doca/include/doca_context.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_context.hpp similarity index 92% rename from morpheus/_lib/doca/include/doca_context.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_context.hpp index 5fbb0e3b09..018c7aca94 100644 --- a/morpheus/_lib/doca/include/doca_context.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_context.hpp @@ -17,9 +17,9 @@ #pragma once -#include "common.hpp" -#include "error.hpp" -#include "rte_context.hpp" +#include "morpheus/doca/common.hpp" +#include "morpheus/doca/error.hpp" +#include "morpheus/doca/rte_context.hpp" #include #include diff --git a/morpheus/_lib/doca/include/doca_mem.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_mem.hpp similarity index 93% rename from morpheus/_lib/doca/include/doca_mem.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_mem.hpp index 255e55e562..a290326529 100644 --- 
a/morpheus/_lib/doca/include/doca_mem.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_mem.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,8 +17,8 @@ #pragma once -#include "doca_context.hpp" -#include "error.hpp" +#include "morpheus/doca/doca_context.hpp" +#include "morpheus/doca/error.hpp" #include diff --git a/morpheus/_lib/doca/include/doca_rx_pipe.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_rx_pipe.hpp similarity index 95% rename from morpheus/_lib/doca/include/doca_rx_pipe.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_rx_pipe.hpp index 39d44713f7..4c7940ac22 100644 --- a/morpheus/_lib/doca/include/doca_rx_pipe.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_rx_pipe.hpp @@ -17,8 +17,8 @@ #pragma once -#include "doca_context.hpp" -#include "doca_rx_queue.hpp" +#include "morpheus/doca/doca_context.hpp" +#include "morpheus/doca/doca_rx_queue.hpp" namespace morpheus::doca { diff --git a/morpheus/_lib/doca/include/doca_rx_queue.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_rx_queue.hpp similarity index 94% rename from morpheus/_lib/doca/include/doca_rx_queue.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_rx_queue.hpp index 5d5f162151..537061954c 100644 --- a/morpheus/_lib/doca/include/doca_rx_queue.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_rx_queue.hpp @@ -17,8 +17,8 @@ #pragma once -#include "doca_context.hpp" -#include "doca_mem.hpp" +#include "morpheus/doca/doca_context.hpp" +#include "morpheus/doca/doca_mem.hpp" #include #include diff --git a/morpheus/_lib/doca/include/doca_semaphore.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_semaphore.hpp similarity index 97% rename from 
morpheus/_lib/doca/include/doca_semaphore.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_semaphore.hpp index 9a1fbc3f6c..635455b442 100644 --- a/morpheus/_lib/doca/include/doca_semaphore.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_semaphore.hpp @@ -17,7 +17,7 @@ #pragma once -#include "doca_context.hpp" +#include "morpheus/doca/doca_context.hpp" namespace morpheus::doca { diff --git a/morpheus/_lib/doca/include/doca_source.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_source.hpp similarity index 98% rename from morpheus/_lib/doca/include/doca_source.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_source.hpp index 3b9d8ea9d1..90882d10b5 100644 --- a/morpheus/_lib/doca/include/doca_source.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/doca_source.hpp @@ -17,8 +17,7 @@ #pragma once -#include "common.hpp" - +#include "morpheus/doca/common.hpp" #include "morpheus/messages/meta.hpp" #include diff --git a/morpheus/_lib/doca/include/doca_source_kernels.hpp b/morpheus/_lib/doca/include/morpheus/doca/doca_source_kernels.hpp similarity index 100% rename from morpheus/_lib/doca/include/doca_source_kernels.hpp rename to morpheus/_lib/doca/include/morpheus/doca/doca_source_kernels.hpp diff --git a/morpheus/_lib/doca/include/error.hpp b/morpheus/_lib/doca/include/morpheus/doca/error.hpp similarity index 98% rename from morpheus/_lib/doca/include/error.hpp rename to morpheus/_lib/doca/include/morpheus/doca/error.hpp index 90cfc97671..db396f1fbc 100644 --- a/morpheus/_lib/doca/include/error.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/error.hpp @@ -17,8 +17,9 @@ #pragma once +#include "morpheus/utilities/string_util.hpp" + #include -#include #include diff --git a/morpheus/_lib/doca/include/rte_context.hpp b/morpheus/_lib/doca/include/morpheus/doca/rte_context.hpp similarity index 93% rename from morpheus/_lib/doca/include/rte_context.hpp rename to morpheus/_lib/doca/include/morpheus/doca/rte_context.hpp index 
c8c447e14d..28ab636568 100644 --- a/morpheus/_lib/doca/include/rte_context.hpp +++ b/morpheus/_lib/doca/include/morpheus/doca/rte_context.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/morpheus/_lib/doca/module.cpp b/morpheus/_lib/doca/module.cpp index fdf06cdcff..4aab98aaa1 100644 --- a/morpheus/_lib/doca/module.cpp +++ b/morpheus/_lib/doca/module.cpp @@ -15,7 +15,7 @@ * limitations under the License. */ -#include "doca_source.hpp" +#include "morpheus/doca/doca_source.hpp" #include // IWYU pragma: keep #include diff --git a/morpheus/_lib/doca/src/doca_context.cpp b/morpheus/_lib/doca/src/doca_context.cpp index 87e7a350d4..0ec12c3c07 100644 --- a/morpheus/_lib/doca/src/doca_context.cpp +++ b/morpheus/_lib/doca/src/doca_context.cpp @@ -15,11 +15,10 @@ * limitations under the License. */ -#include "doca_context.hpp" - -#include "common.hpp" -#include "error.hpp" +#include "morpheus/doca/doca_context.hpp" +#include "morpheus/doca/common.hpp" +#include "morpheus/doca/error.hpp" #include "morpheus/utilities/error.hpp" #include @@ -97,11 +96,11 @@ doca_flow_port* init_doca_flow(uint16_t port_id, uint8_t rxq_num) rte_eth_dev_info dev_info = {nullptr}; rte_eth_conf eth_conf = { .rxmode = - { + { .mtu = 2048, /* Not really used, just to initialize DPDK */ }, .txmode = - { + { .offloads = RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM, }, }; diff --git a/morpheus/_lib/doca/src/doca_rx_pipe.cpp b/morpheus/_lib/doca/src/doca_rx_pipe.cpp index 8879da66ad..32f00253a2 100644 --- a/morpheus/_lib/doca/src/doca_rx_pipe.cpp +++ b/morpheus/_lib/doca/src/doca_rx_pipe.cpp @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -#include "doca_rx_pipe.hpp" +#include "morpheus/doca/doca_rx_pipe.hpp" #include #include @@ -37,7 +37,7 @@ DocaRxPipe::DocaRxPipe(std::shared_ptr context, doca_flow_match match_mask{0}; doca_flow_match match{}; - match.outer.l3_type = DOCA_FLOW_L3_TYPE_IP4; + match.outer.l3_type = DOCA_FLOW_L3_TYPE_IP4; if (m_traffic_type == DOCA_TRAFFIC_TYPE_TCP) { match.outer.ip4.next_proto = IPPROTO_TCP; @@ -50,14 +50,14 @@ DocaRxPipe::DocaRxPipe(std::shared_ptr context, } doca_flow_fwd fwd{}; - fwd.type = DOCA_FLOW_FWD_RSS; + fwd.type = DOCA_FLOW_FWD_RSS; if (m_traffic_type == DOCA_TRAFFIC_TYPE_TCP) fwd.rss_outer_flags = DOCA_FLOW_RSS_IPV4 | DOCA_FLOW_RSS_TCP; else fwd.rss_outer_flags = DOCA_FLOW_RSS_IPV4 | DOCA_FLOW_RSS_UDP; - fwd.rss_queues = rss_queues.begin(); - fwd.num_of_queues = m_rxq.size(); + fwd.rss_queues = rss_queues.begin(); + fwd.num_of_queues = m_rxq.size(); doca_flow_fwd miss_fwd{}; miss_fwd.type = DOCA_FLOW_FWD_DROP; @@ -68,13 +68,13 @@ DocaRxPipe::DocaRxPipe(std::shared_ptr context, doca_flow_pipe_cfg pipe_cfg{}; pipe_cfg.attr.name = "GPU_RXQ_PIPE"; pipe_cfg.attr.enable_strict_matching = true; - pipe_cfg.attr.type = DOCA_FLOW_PIPE_BASIC; - pipe_cfg.attr.nb_actions = 0; - pipe_cfg.attr.is_root = false; - pipe_cfg.match = &match; - pipe_cfg.match_mask = &match_mask; - pipe_cfg.monitor = &monitor; - pipe_cfg.port = context->flow_port(); + pipe_cfg.attr.type = DOCA_FLOW_PIPE_BASIC; + pipe_cfg.attr.nb_actions = 0; + pipe_cfg.attr.is_root = false; + pipe_cfg.match = &match; + pipe_cfg.match_mask = &match_mask; + pipe_cfg.monitor = &monitor; + pipe_cfg.port = context->flow_port(); DOCA_TRY(doca_flow_pipe_create(&pipe_cfg, &fwd, &miss_fwd, &m_pipe)); @@ -90,7 +90,7 @@ DocaRxPipe::DocaRxPipe(std::shared_ptr context, doca_flow_monitor root_monitor = {}; root_monitor.counter_type = DOCA_FLOW_RESOURCE_TYPE_NON_SHARED; - doca_flow_pipe_cfg root_pipe_cfg = {}; + doca_flow_pipe_cfg root_pipe_cfg = {}; root_pipe_cfg.attr.name = "ROOT_PIPE"; 
root_pipe_cfg.attr.enable_strict_matching = true; root_pipe_cfg.attr.is_root = true; diff --git a/morpheus/_lib/doca/src/doca_rx_queue.cpp b/morpheus/_lib/doca/src/doca_rx_queue.cpp index 0e9b9c1dfd..5b802e871d 100644 --- a/morpheus/_lib/doca/src/doca_rx_queue.cpp +++ b/morpheus/_lib/doca/src/doca_rx_queue.cpp @@ -15,11 +15,10 @@ * limitations under the License. */ -#include "doca_rx_queue.hpp" - -#include "common.hpp" -#include "error.hpp" +#include "morpheus/doca/doca_rx_queue.hpp" +#include "morpheus/doca/common.hpp" +#include "morpheus/doca/error.hpp" #include "morpheus/utilities/error.hpp" #include diff --git a/morpheus/_lib/doca/src/doca_semaphore.cpp b/morpheus/_lib/doca/src/doca_semaphore.cpp index 71298d2e5b..d0da096d7c 100644 --- a/morpheus/_lib/doca/src/doca_semaphore.cpp +++ b/morpheus/_lib/doca/src/doca_semaphore.cpp @@ -15,7 +15,7 @@ * limitations under the License. */ -#include "doca_semaphore.hpp" +#include "morpheus/doca/doca_semaphore.hpp" namespace morpheus::doca { diff --git a/morpheus/_lib/doca/src/doca_source.cpp b/morpheus/_lib/doca/src/doca_source.cpp index 320b2cbf8d..b855f82dcc 100644 --- a/morpheus/_lib/doca/src/doca_source.cpp +++ b/morpheus/_lib/doca/src/doca_source.cpp @@ -15,14 +15,13 @@ * limitations under the License. 
*/ -#include "doca_source.hpp" - -#include "doca_context.hpp" -#include "doca_rx_pipe.hpp" -#include "doca_rx_queue.hpp" -#include "doca_semaphore.hpp" -#include "doca_source_kernels.hpp" +#include "morpheus/doca/doca_source.hpp" +#include "morpheus/doca/doca_context.hpp" +#include "morpheus/doca/doca_rx_pipe.hpp" +#include "morpheus/doca/doca_rx_queue.hpp" +#include "morpheus/doca/doca_semaphore.hpp" +#include "morpheus/doca/doca_source_kernels.hpp" #include "morpheus/utilities/error.hpp" #include @@ -129,7 +128,7 @@ DocaSourceStage::subscriber_fn_t DocaSourceStage::build() if (thread_idx >= MAX_QUEUE) { - MORPHEUS_FAIL(MORPHEUS_CONCAT_STR("Thread ID " << thread_idx << " bigger than MAX_QUEUE " << MAX_QUEUE)); + MORPHEUS_FAIL("More CPU threads than allowed queues"); } payload_buffer_d.reserve(MAX_SEM_X_QUEUE); diff --git a/morpheus/_lib/doca/src/doca_source_kernels.cu b/morpheus/_lib/doca/src/doca_source_kernels.cu index d72e6ec1a8..773539e14b 100644 --- a/morpheus/_lib/doca/src/doca_source_kernels.cu +++ b/morpheus/_lib/doca/src/doca_source_kernels.cu @@ -15,7 +15,7 @@ * limitations under the License. */ -#include "common.hpp" +#include "morpheus/doca/common.hpp" #include "morpheus/utilities/error.hpp" @@ -391,7 +391,7 @@ __global__ void _packet_receive_kernel( auto epoch = now_ms.time_since_epoch(); pkt_info->timestamp_out[packet_idx] = epoch.count(); } - + // if (threadIdx.x == 0) DEVICE_GET_TIME(reduce_start); auto payload_size_total = BlockReduce(temp_storage).Sum(_payload_sizes); __syncthreads(); diff --git a/morpheus/_lib/doca/src/rte_context.cpp b/morpheus/_lib/doca/src/rte_context.cpp index 8576cb0444..705da11235 100644 --- a/morpheus/_lib/doca/src/rte_context.cpp +++ b/morpheus/_lib/doca/src/rte_context.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,9 +15,9 @@ * limitations under the License. */ -#include "rte_context.hpp" +#include "morpheus/doca/rte_context.hpp" -#include "error.hpp" +#include "morpheus/doca/error.hpp" #include #include diff --git a/morpheus/_lib/include/morpheus/io/data_loader_registry.hpp b/morpheus/_lib/include/morpheus/io/data_loader_registry.hpp index 325dad98c3..1c038c2617 100644 --- a/morpheus/_lib/include/morpheus/io/data_loader_registry.hpp +++ b/morpheus/_lib/include/morpheus/io/data_loader_registry.hpp @@ -24,7 +24,6 @@ #include #include -#include #include #include diff --git a/morpheus/_lib/include/morpheus/llm/input_map.hpp b/morpheus/_lib/include/morpheus/llm/input_map.hpp index cc4a5d3851..7bc7c1750f 100644 --- a/morpheus/_lib/include/morpheus/llm/input_map.hpp +++ b/morpheus/_lib/include/morpheus/llm/input_map.hpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include namespace morpheus::llm { diff --git a/morpheus/_lib/include/morpheus/llm/llm_node_runner.hpp b/morpheus/_lib/include/morpheus/llm/llm_node_runner.hpp index a0d4845ae0..d15c2d41f9 100644 --- a/morpheus/_lib/include/morpheus/llm/llm_node_runner.hpp +++ b/morpheus/_lib/include/morpheus/llm/llm_node_runner.hpp @@ -20,7 +20,6 @@ #include "morpheus/export.h" #include "morpheus/llm/fwd.hpp" #include "morpheus/llm/input_map.hpp" -#include "morpheus/llm/llm_node_base.hpp" #include "morpheus/types.hpp" #include diff --git a/morpheus/_lib/include/morpheus/messages/control.hpp b/morpheus/_lib/include/morpheus/messages/control.hpp index 8ee020c76d..2eb45dea7e 100644 --- a/morpheus/_lib/include/morpheus/messages/control.hpp +++ b/morpheus/_lib/include/morpheus/messages/control.hpp @@ -17,17 +17,17 @@ #pragma once -#include "morpheus/messages/meta.hpp" +#include "morpheus/messages/meta.hpp" // for MessageMeta -#include -#include +#include // for json, basic_json +#include // for object, dict, list, none 
-#include -#include -#include -#include -#include -#include +#include // for system_clock, time_point +#include // for map +#include // for shared_ptr +#include // for optional +#include // for string +#include // for vector namespace morpheus { diff --git a/morpheus/_lib/include/morpheus/messages/memory/inference_memory_fil.hpp b/morpheus/_lib/include/morpheus/messages/memory/inference_memory_fil.hpp index 9b78533218..0fb7b2882b 100644 --- a/morpheus/_lib/include/morpheus/messages/memory/inference_memory_fil.hpp +++ b/morpheus/_lib/include/morpheus/messages/memory/inference_memory_fil.hpp @@ -17,6 +17,7 @@ #pragma once +#include "morpheus/export.h" #include "morpheus/messages/memory/inference_memory.hpp" #include "morpheus/objects/tensor_object.hpp" #include "morpheus/types.hpp" // for TensorIndex @@ -34,12 +35,11 @@ namespace morpheus { * @{ * @file */ - /** * This is a container class for data that needs to be submitted to the inference server for FIL category * usecases. */ -class InferenceMemoryFIL : public InferenceMemory +class MORPHEUS_EXPORT InferenceMemoryFIL : public InferenceMemory { public: /** diff --git a/morpheus/_lib/include/morpheus/messages/meta.hpp b/morpheus/_lib/include/morpheus/messages/meta.hpp index 24687013a8..11439b7e10 100644 --- a/morpheus/_lib/include/morpheus/messages/meta.hpp +++ b/morpheus/_lib/include/morpheus/messages/meta.hpp @@ -19,6 +19,7 @@ #include "morpheus/objects/data_table.hpp" // for IDataTable #include "morpheus/objects/table_info.hpp" +#include "morpheus/objects/tensor_object.hpp" #include "morpheus/types.hpp" // for TensorIndex #include @@ -30,6 +31,7 @@ #include namespace morpheus { + #pragma GCC visibility push(default) /****** Component public implementations ******************/ /****** MessageMeta****************************************/ @@ -64,6 +66,38 @@ class MessageMeta */ virtual TableInfo get_info() const; + /** + * @brief Get the info object for a specific column + * + * @param col_name The name of the 
column to slice + * @return TableInfo The table info containing only the column specified + */ + virtual TableInfo get_info(const std::string& col_name) const; + + /** + * @brief Get the info object for a specific set of columns + * + * @param column_names The names of the columns to slice + * @return TableInfo The table info containing only the columns specified, in the order specified + */ + virtual TableInfo get_info(const std::vector& column_names) const; + + /** + * @brief Set the data for a single column from a TensorObject + * + * @param col_name The name of the column to set + * @param tensor The tensor to set the column to + */ + virtual void set_data(const std::string& col_name, TensorObject tensor); + + /** + * @brief Set the data for multiple columns from a vector of TensorObjects + * + * @param column_names The names of the columns to set + * @param tensors The tensors to set the columns to + */ + virtual void set_data(const std::vector& column_names, const std::vector& tensors); + /** * TODO(Documentation) */ @@ -188,6 +222,51 @@ struct MessageMetaInterfaceProxy */ static TensorIndex count(MessageMeta& self); + /** + * @brief Gets a DataFrame for all columns + * + * @param self The MessageMeta instance + * @return pybind11::object A python DataFrame containing the info for all columns + */ + static pybind11::object get_data(MessageMeta& self); + + /** + * @brief Get a Series for a single column + * + * @param self The MessageMeta instance + * @param col_name The name of the column to get + * @return pybind11::object A python Series containing the info for the specified column + */ + static pybind11::object get_data(MessageMeta& self, std::string col_name); + + /** + * @brief Get a DataFrame for a set of columns + * + * @param self The MessageMeta instance + * @param columns The names of the columns to get + * @return pybind11::object A python DataFrame containing the info for the specified columns, in the order specified + */ + static pybind11::object 
get_data(MessageMeta& self, std::vector columns); + + /** + * @brief Gets a DataFrame for all columns. This is only used for overload resolution from python + * + * @param self The MessageMeta instance + * @param none_obj An object of None + * @return pybind11::object A python DataFrame containing the info for all columns + */ + static pybind11::object get_data(MessageMeta& self, pybind11::none none_obj); + + /** + * @brief Set the values for one or more columns from a python object + * + * @param self The MessageMeta instance + * @param columns The names of the columns to set + * @param value The value to set the columns to. This can be a scalar, a list, a numpy array, a Series, or a + * DataFrame. The dimension must match the number of columns according to DataFrame broadcasting rules. + */ + static void set_data(MessageMeta& self, pybind11::object columns, pybind11::object value); + static std::vector get_column_names(MessageMeta& self); /** @@ -197,6 +276,7 @@ struct MessageMetaInterfaceProxy * @return pybind11::object A `DataFrame` object */ static pybind11::object get_data_frame(MessageMeta& self); + static pybind11::object df_property(MessageMeta& self); static MutableTableCtxMgr mutable_dataframe(MessageMeta& self); diff --git a/morpheus/_lib/include/morpheus/objects/memory_descriptor.hpp b/morpheus/_lib/include/morpheus/objects/memory_descriptor.hpp index 61969d9a2e..83d98fc2bd 100644 --- a/morpheus/_lib/include/morpheus/objects/memory_descriptor.hpp +++ b/morpheus/_lib/include/morpheus/objects/memory_descriptor.hpp @@ -20,7 +20,8 @@ #include "morpheus/export.h" #include -#include + +#include "cuda/memory_resource" /** * @brief Struct describing device memory resources. 
diff --git a/morpheus/_lib/include/morpheus/objects/tensor.hpp b/morpheus/_lib/include/morpheus/objects/tensor.hpp index c095fc77f2..93f5fe3aba 100644 --- a/morpheus/_lib/include/morpheus/objects/tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/tensor.hpp @@ -21,6 +21,7 @@ #include "morpheus/objects/tensor_object.hpp" #include "morpheus/types.hpp" // for ShapeType, TensorIndex, TensorSize +#include #include #include // for uint8_t diff --git a/morpheus/_lib/include/morpheus/stages/add_classification.hpp b/morpheus/_lib/include/morpheus/stages/add_classification.hpp index db2df629f6..d37981c342 100644 --- a/morpheus/_lib/include/morpheus/stages/add_classification.hpp +++ b/morpheus/_lib/include/morpheus/stages/add_classification.hpp @@ -17,15 +17,18 @@ #pragma once -#include "morpheus/stages/add_scores_stage_base.hpp" +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/stages/add_scores_stage_base.hpp" // for AddScoresStageBase -#include -#include +#include // for Builder +#include // for Object +#include // for trace_activity #include // for size_t -#include -#include -#include +#include // for map +#include // for shared_ptr +#include // for string namespace morpheus { @@ -43,7 +46,8 @@ namespace morpheus { * @brief Add detected classifications to each message. Classification labels based on probabilities calculated in * inference stage. Label indexes will be looked up in the idx2label property. 
*/ -class AddClassificationsStage : public AddScoresStageBase +template +class AddClassificationsStage : public AddScoresStageBase { public: /** @@ -55,6 +59,11 @@ class AddClassificationsStage : public AddScoresStageBase AddClassificationsStage(std::map idx2label, float threshold); }; +using AddClassificationsStageMM = // NOLINT(readability-identifier-naming) + AddClassificationsStage; +using AddClassificationsStageCM = // NOLINT(readability-identifier-naming) + AddClassificationsStage; + /****** AddClassificationStageInterfaceProxy******************/ /** * @brief Interface proxy, used to insulate python bindings. @@ -62,15 +71,33 @@ class AddClassificationsStage : public AddScoresStageBase struct AddClassificationStageInterfaceProxy { /** - * @brief Create and initialize a AddClassificationStage, and return the result + * @brief Create and initialize a AddClassificationStage that receives MultiResponseMessage and emits + * MultiResponseMessage, and return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param idx2label : Index to classification labels map + * @param threshold : Threshold to consider true/false for each class + * @return std::shared_ptr>> + */ + static std::shared_ptr>> + init_multi(mrc::segment::Builder& builder, + const std::string& name, + std::map idx2label, + float threshold); + + /** + * @brief Create and initialize a AddClassificationStage that receives ControlMessage and emits ControlMessage, and + * return the result * * @param builder : Pipeline context object reference * @param name : Name of a stage reference * @param idx2label : Index to classification labels map * @param threshold : Threshold to consider true/false for each class - * @return std::shared_ptr> + * @return std::shared_ptr>> */ - static std::shared_ptr> init( + static std::shared_ptr>> init_cm( mrc::segment::Builder& builder, const std::string& name, std::map idx2label, diff --git 
a/morpheus/_lib/include/morpheus/stages/add_scores.hpp b/morpheus/_lib/include/morpheus/stages/add_scores.hpp index 7173338294..df133606cb 100644 --- a/morpheus/_lib/include/morpheus/stages/add_scores.hpp +++ b/morpheus/_lib/include/morpheus/stages/add_scores.hpp @@ -17,15 +17,18 @@ #pragma once -#include "morpheus/stages/add_scores_stage_base.hpp" +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/stages/add_scores_stage_base.hpp" // for AddScoresStageBase -#include -#include +#include // for Builder +#include // for Object +#include // for trace_activity #include // for size_t -#include -#include -#include +#include // for map +#include // for shared_ptr +#include // for string namespace morpheus { /****** Component public implementations *******************/ @@ -42,7 +45,8 @@ namespace morpheus { * @brief Add probability scores to each message. Score labels based on probabilities calculated in inference stage. * Label indexes will be looked up in the idx2label property. */ -class AddScoresStage : public AddScoresStageBase +template +class AddScoresStage : public AddScoresStageBase { public: /** @@ -53,6 +57,11 @@ class AddScoresStage : public AddScoresStageBase AddScoresStage(std::map idx2label); }; +using AddScoresStageMM = // NOLINT(readability-identifier-naming) + AddScoresStage; +using AddScoresStageCM = // NOLINT(readability-identifier-naming) + AddScoresStage; + /****** AddScoresStageInterfaceProxy******************/ /** * @brief Interface proxy, used to insulate python bindings. 
@@ -60,17 +69,30 @@ class AddScoresStage : public AddScoresStageBase struct AddScoresStageInterfaceProxy { /** - * @brief Create and initialize a AddScoresStage, and return the result + * @brief Create and initialize a AddScoresStage that receives MultiResponseMessage and emits MultiResponseMessage, + * and return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param num_class_labels : Number of classification labels + * @param idx2label : Index to classification labels map + * @return std::shared_ptr>> + */ + static std::shared_ptr>> init_multi( + mrc::segment::Builder& builder, const std::string& name, std::map idx2label); + + /** + * @brief Create and initialize a AddScoresStage that receives ControlMessage and emits ControlMessage, + * and return the result * * @param builder : Pipeline context object reference * @param name : Name of a stage reference * @param num_class_labels : Number of classification labels * @param idx2label : Index to classification labels map - * @return std::shared_ptr> + * @return std::shared_ptr>> */ - static std::shared_ptr> init(mrc::segment::Builder& builder, - const std::string& name, - std::map idx2label); + static std::shared_ptr>> init_cm( + mrc::segment::Builder& builder, const std::string& name, std::map idx2label); }; #pragma GCC visibility pop diff --git a/morpheus/_lib/include/morpheus/stages/add_scores_stage_base.hpp b/morpheus/_lib/include/morpheus/stages/add_scores_stage_base.hpp index 024150e7a9..da8dff1214 100644 --- a/morpheus/_lib/include/morpheus/stages/add_scores_stage_base.hpp +++ b/morpheus/_lib/include/morpheus/stages/add_scores_stage_base.hpp @@ -17,25 +17,21 @@ #pragma once -#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/messages/control.hpp" +#include "morpheus/messages/multi_response.hpp" #include -#include -#include -#include -#include -#include -#include #include #include -#include // 
for size_t +#include #include #include #include #include #include -#include + +// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" namespace morpheus { /****** Component public implementations *******************/ @@ -51,11 +47,11 @@ namespace morpheus { /** * @brief Base class for both `AddScoresStage` and `AddClassificationStage` */ -class AddScoresStageBase - : public mrc::pymrc::PythonNode, std::shared_ptr> +template +class AddScoresStageBase : public mrc::pymrc::PythonNode, std::shared_ptr> { public: - using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; + using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; using typename base_t::sink_type_t; using typename base_t::source_type_t; using typename base_t::subscribe_fn_t; @@ -68,12 +64,14 @@ class AddScoresStageBase */ AddScoresStageBase(std::map idx2label, std::optional threshold); - private: /** * Called every time a message is passed to this stage */ source_type_t on_data(sink_type_t x); + private: + void on_multi_response_message(std::shared_ptr x); + void on_control_message(std::shared_ptr x); std::map m_idx2label; std::optional m_threshold; @@ -81,6 +79,11 @@ class AddScoresStageBase std::size_t m_min_col_count; }; +using AddScoresStageBaseMM = // NOLINT(readability-identifier-naming) + AddScoresStageBase; +using AddScoresStageBaseCM = // NOLINT(readability-identifier-naming) + AddScoresStageBase; + #pragma GCC visibility pop /** @} */ // end of group } // namespace morpheus diff --git a/morpheus/_lib/include/morpheus/stages/filter_detection.hpp b/morpheus/_lib/include/morpheus/stages/filter_detection.hpp index 930a30dac7..092a7c37de 100644 --- a/morpheus/_lib/include/morpheus/stages/filter_detection.hpp +++ b/morpheus/_lib/include/morpheus/stages/filter_detection.hpp @@ -22,14 +22,8 @@ #include "morpheus/objects/filter_source.hpp" #include -#include -#include -#include -#include -#include #include #include -#include #include #include @@ -38,7 +32,6 @@ #include #include #include -#include 
namespace morpheus { /****** Component public implementations *******************/ diff --git a/morpheus/_lib/include/morpheus/stages/http_server_source_stage.hpp b/morpheus/_lib/include/morpheus/stages/http_server_source_stage.hpp index 0f6306e714..c311b6aae6 100644 --- a/morpheus/_lib/include/morpheus/stages/http_server_source_stage.hpp +++ b/morpheus/_lib/include/morpheus/stages/http_server_source_stage.hpp @@ -22,25 +22,18 @@ #include // for buffered_channel #include // for context -#include -#include // for table_with_metadata -#include // for RxSinkBase -#include // for RxSourceBase -#include // for channel::Status, SourceProperties<>::source_type_t -#include // for segment::Builder -#include // for segment::Object -#include // for SegmentAddress -#include // for PythonSource -#include // for subscriber +#include // for table_with_metadata +#include // for segment::Builder +#include // for segment::Object +#include // for PythonSource +#include // for subscriber #include // for duration #include // for size_t #include // for int64_t -#include -#include // for shared_ptr & unique_ptr -#include // for std::milli -#include // for string & to_string -#include +#include // for shared_ptr & unique_ptr +#include // for std::milli +#include // for string & to_string // IWYU thinks we're using thread::operator<< // IWYU pragma: no_include diff --git a/morpheus/_lib/include/morpheus/stages/kafka_source.hpp b/morpheus/_lib/include/morpheus/stages/kafka_source.hpp index a879473c3a..909c9c7527 100644 --- a/morpheus/_lib/include/morpheus/stages/kafka_source.hpp +++ b/morpheus/_lib/include/morpheus/stages/kafka_source.hpp @@ -21,15 +21,10 @@ #include "morpheus/types.hpp" #include -#include #include #include -#include -#include -#include #include #include -#include #include #include #include // for apply, make_subscriber, observable_member, is_on_error<>::not_void, is_on_next_of<>::not_void, trace_activity diff --git a/morpheus/_lib/include/morpheus/stages/preprocess_fil.hpp 
b/morpheus/_lib/include/morpheus/stages/preprocess_fil.hpp index 683badf4bb..982ebca09d 100644 --- a/morpheus/_lib/include/morpheus/stages/preprocess_fil.hpp +++ b/morpheus/_lib/include/morpheus/stages/preprocess_fil.hpp @@ -17,6 +17,7 @@ #pragma once +#include "morpheus/messages/control.hpp" #include "morpheus/messages/multi.hpp" #include "morpheus/messages/multi_inference.hpp" #include "morpheus/objects/table_info.hpp" @@ -25,14 +26,15 @@ #include #include #include -#include // for apply, make_subscriber, observable_member, is_on_error<>::not_void, is_on_next_of<>::not_void, from -// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" +#include #include #include #include #include +// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" + namespace morpheus { /****** Component public implementations *******************/ @@ -48,11 +50,11 @@ namespace morpheus { /** * @brief FIL input data for inference */ -class PreprocessFILStage - : public mrc::pymrc::PythonNode, std::shared_ptr> +template +class PreprocessFILStage : public mrc::pymrc::PythonNode, std::shared_ptr> { public: - using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; + using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; using typename base_t::sink_type_t; using typename base_t::source_type_t; using typename base_t::subscribe_fn_t; @@ -64,35 +66,54 @@ class PreprocessFILStage */ PreprocessFILStage(const std::vector& features); - private: /** - * TODO(Documentation) + * Called every time a message is passed to this stage */ - subscribe_fn_t build_operator(); + source_type_t on_data(sink_type_t x); + private: + std::shared_ptr on_multi_message(std::shared_ptr x); + std::shared_ptr on_control_message(std::shared_ptr x); + void transform_bad_columns(std::vector& fea_cols, morpheus::MutableTableInfo& mutable_info); TableInfo fix_bad_columns(sink_type_t x); std::vector m_fea_cols; std::string m_vocab_file; }; +using PreprocessFILStageMM = // NOLINT(readability-identifier-naming) + 
PreprocessFILStage; +using PreprocessFILStageCM = // NOLINT(readability-identifier-naming) + PreprocessFILStage; + /****** PreprocessFILStageInferenceProxy********************/ /** * @brief Interface proxy, used to insulate python bindings. */ struct PreprocessFILStageInterfaceProxy { + /** + * @brief Create and initialize a PreprocessFILStage that receives MultiMessage and emits MultiInferenceMessage, + * and return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param features : Reference to the features that are required for model inference + * @return std::shared_ptr>> + */ + static std::shared_ptr>> init_multi( + mrc::segment::Builder& builder, const std::string& name, const std::vector& features); + /** * @brief Create and initialize a PreprocessFILStage, and return the result * * @param builder : Pipeline context object reference * @param name : Name of a stage reference * @param features : Reference to the features that are required for model inference - * @return std::shared_ptr> + * @return std::shared_ptr>> */ - static std::shared_ptr> init(mrc::segment::Builder& builder, - const std::string& name, - const std::vector& features); + static std::shared_ptr>> init_cm( + mrc::segment::Builder& builder, const std::string& name, const std::vector& features); }; #pragma GCC visibility pop /** @} */ // end of group diff --git a/morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp b/morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp index ea330fb330..c6c03f7311 100644 --- a/morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp +++ b/morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp @@ -17,28 +17,25 @@ #pragma once -#include "morpheus/messages/multi.hpp" -#include "morpheus/messages/multi_inference.hpp" +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/multi.hpp" // for MultiMessage +#include "morpheus/messages/multi_inference.hpp" 
// for MultiInferenceMessage -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // for apply, make_subscriber, observable_member, is_on_error<>::not_void, is_on_next_of<>::not_void, from -// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" +#include // for operator<< +#include // for strings_column_view +#include // for Builder +#include // for Object +#include // for tokenizer_result +#include // for PythonNode +#include // for device_memory_resource +#include // for observable_member, trace_activity, decay_t #include // for uint32_t -#include -#include -#include -#include -#include +#include // for shared_ptr, allocator +#include // for string +#include // for operator<< + +// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" namespace morpheus { /****** Component public implementations *******************/ @@ -54,11 +51,11 @@ namespace morpheus { /** * @brief NLP input data for inference */ -class PreprocessNLPStage - : public mrc::pymrc::PythonNode, std::shared_ptr> +template +class PreprocessNLPStage : public mrc::pymrc::PythonNode, std::shared_ptr> { public: - using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; + using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; using typename base_t::sink_type_t; using typename base_t::source_type_t; using typename base_t::subscribe_fn_t; @@ -89,12 +86,21 @@ class PreprocessNLPStage int stride = -1, std::string column = "data"); - private: /** - * TODO(Documentation) + * Called every time a message is passed to this stage */ - subscribe_fn_t build_operator(); + source_type_t on_data(sink_type_t x); + private: + std::shared_ptr on_multi_message(std::shared_ptr x); + std::shared_ptr on_control_message(std::shared_ptr x); + nvtext::tokenizer_result subword_tokenize(const std::string& vocab_hash_file, + uint32_t sequence_length, + bool do_lower_case, + bool truncation, + cudf::strings_column_view const& string_col, + int stride, + 
rmm::mr::device_memory_resource* mr); std::string m_vocab_hash_file; std::string m_column; uint32_t m_sequence_length; @@ -104,6 +110,11 @@ class PreprocessNLPStage int m_stride{-1}; }; +using PreprocessNLPStageMM = // NOLINT(readability-identifier-naming) + PreprocessNLPStage; +using PreprocessNLPStageCM = // NOLINT(readability-identifier-naming) + PreprocessNLPStage; + /****** PreprocessNLPStageInferenceProxy********************/ /** * @brief Interface proxy, used to insulate python bindings. @@ -111,7 +122,40 @@ class PreprocessNLPStage struct PreprocessNLPStageInterfaceProxy { /** - * @brief Create and initialize a ProcessNLPStage, and return the result + * @brief Create and initialize a ProcessNLPStage that receives MultiMessage and emits MultiInferenceMessage, and + * return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param vocab_hash_file : Path to hash file containing vocabulary of words with token-ids. This can be created + * from the raw vocabulary using the `cudf.utils.hash_vocab_utils.hash_vocab` function. + * @param sequence_length : Sequence Length to use (We add to special tokens for NER classification job). + * @param truncation : If set to true, strings will be truncated and padded to max_length. Each input string will + * result in exactly one output sequence. If set to false, there may be multiple output sequences when the + * max_length is smaller than generated tokens. + * @param do_lower_case : If set to true, original text will be lowercased before encoding. + * @param add_special_token : Whether or not to encode the sequences with the special tokens of the BERT + * classification model. + * @param stride : If `truncation` == False and the tokenized string is larger than max_length, the sequences + * containing the overflowing token-ids can contain duplicated token-ids from the main sequence. If max_length is + * equal to stride there are no duplicated-id tokens. 
If stride is 80% of max_length, 20% of the first sequence will + * be repeated on the second sequence and so on until the entire sentence is encoded. + * @param column : Name of the string column to operate on, defaults to "data". + * @return std::shared_ptr>> + */ + static std::shared_ptr>> init_multi( + mrc::segment::Builder& builder, + const std::string& name, + std::string vocab_hash_file, + uint32_t sequence_length, + bool truncation, + bool do_lower_case, + bool add_special_token, + int stride = -1, + std::string column = "data"); + /** + * @brief Create and initialize a ProcessNLPStage that receives ControlMessage and emits ControlMessage, and return + * the result * * @param builder : Pipeline context object reference * @param name : Name of a stage reference @@ -129,18 +173,20 @@ struct PreprocessNLPStageInterfaceProxy * equal to stride there are no duplicated-id tokens. If stride is 80% of max_length, 20% of the first sequence will * be repeated on the second sequence and so on until the entire sentence is encoded. * @param column : Name of the string column to operate on, defaults to "data". 
- * @return std::shared_ptr> + * @return std::shared_ptr>> */ - static std::shared_ptr> init(mrc::segment::Builder& builder, - const std::string& name, - std::string vocab_hash_file, - uint32_t sequence_length, - bool truncation, - bool do_lower_case, - bool add_special_token, - int stride = -1, - std::string column = "data"); + static std::shared_ptr>> init_cm( + mrc::segment::Builder& builder, + const std::string& name, + std::string vocab_hash_file, + uint32_t sequence_length, + bool truncation, + bool do_lower_case, + bool add_special_token, + int stride = -1, + std::string column = "data"); }; + #pragma GCC visibility pop /** @} */ // end of group } // namespace morpheus diff --git a/morpheus/_lib/include/morpheus/stages/serialize.hpp b/morpheus/_lib/include/morpheus/stages/serialize.hpp index 44b4278cac..36921feeb6 100644 --- a/morpheus/_lib/include/morpheus/stages/serialize.hpp +++ b/morpheus/_lib/include/morpheus/stages/serialize.hpp @@ -17,29 +17,24 @@ #pragma once +#include "morpheus/messages/control.hpp" #include "morpheus/messages/meta.hpp" // for MessageMeta #include "morpheus/messages/multi.hpp" #include -#include -#include -#include -#include -#include #include #include -#include #include #include // for apply, make_subscriber, observable_member, is_on_error<>::not_void, is_on_next_of<>::not_void, from -// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" -#include #include #include #include #include #include // for vector +// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" + namespace morpheus { /****** Component public implementations *******************/ /****** SerializeStage********************************/ @@ -55,10 +50,11 @@ namespace morpheus { * @brief Include & exclude columns from messages. This class filters columns from a `MultiMessage` object emitting a * `MessageMeta`. 
*/ -class SerializeStage : public mrc::pymrc::PythonNode, std::shared_ptr> +template +class SerializeStage : public mrc::pymrc::PythonNode, std::shared_ptr> { public: - using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; + using base_t = mrc::pymrc::PythonNode, std::shared_ptr>; using typename base_t::sink_type_t; using typename base_t::source_type_t; using typename base_t::subscribe_fn_t; @@ -94,6 +90,9 @@ class SerializeStage : public mrc::pymrc::PythonNode m_column_names; }; +using SerializeStageMM = SerializeStage; // NOLINT(readability-identifier-naming) +using SerializeStageCM = SerializeStage; // NOLINT(readability-identifier-naming) + /****** WriteToFileStageInterfaceProxy******************/ /** * @brief Interface proxy, used to insulate python bindings. @@ -111,11 +110,28 @@ struct SerializeStageInterfaceProxy * the same columns as the first message received. * @return std::shared_ptr> */ - static std::shared_ptr> init(mrc::segment::Builder& builder, - const std::string& name, - const std::vector& include, - const std::vector& exclude, - bool fixed_columns = true); + static std::shared_ptr> init_mm(mrc::segment::Builder& builder, + const std::string& name, + const std::vector& include, + const std::vector& exclude, + bool fixed_columns = true); + + /** + * @brief Create and initialize a SerializeStage, and return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param include : Reference to the attributes that are required send to downstream stage. + * @param exclude : Reference to the attributes that are not required send to downstream stage. + * @param fixed_columns : When `True` `SerializeStage` will assume that the Dataframe in all messages contain + * the same columns as the first message received. 
+ * @return std::shared_ptr> + */ + static std::shared_ptr> init_cm(mrc::segment::Builder& builder, + const std::string& name, + const std::vector& include, + const std::vector& exclude, + bool fixed_columns = true); }; #pragma GCC visibility pop diff --git a/morpheus/_lib/include/morpheus/stages/write_to_file.hpp b/morpheus/_lib/include/morpheus/stages/write_to_file.hpp index 8e7d32c427..8efb212488 100644 --- a/morpheus/_lib/include/morpheus/stages/write_to_file.hpp +++ b/morpheus/_lib/include/morpheus/stages/write_to_file.hpp @@ -21,24 +21,16 @@ #include "morpheus/objects/file_types.hpp" #include -#include -#include -#include -#include -#include #include #include -#include #include #include #include #include // for function -#include #include #include #include -#include namespace morpheus { /****** Component public implementations *******************/ diff --git a/morpheus/_lib/include/morpheus/utilities/http_server.hpp b/morpheus/_lib/include/morpheus/utilities/http_server.hpp index f598f5b277..89117cbab9 100644 --- a/morpheus/_lib/include/morpheus/utilities/http_server.hpp +++ b/morpheus/_lib/include/morpheus/utilities/http_server.hpp @@ -17,12 +17,12 @@ #pragma once -#include // for io_context -#include // for tcp, tcp::acceptor, tcp::endpoint, tcp::socket -#include // for error_code -#include // for verb -#include // for error_code -#include // for pybind11::function +#include // for io_context +#include // for tcp, tcp::acceptor, tcp::endpoint, tcp::socket +#include // for error_code +#include // for verb +#include +#include // for pybind11::function #include // for atomic #include // for seconds @@ -46,6 +46,7 @@ namespace morpheus { #pragma GCC visibility push(default) class Listener; + using on_complete_cb_fn_t = std::function; /** diff --git a/morpheus/_lib/llm/module.cpp b/morpheus/_lib/llm/module.cpp index a356d6b504..8a23f559dc 100644 --- a/morpheus/_lib/llm/module.cpp +++ b/morpheus/_lib/llm/module.cpp @@ -37,11 +37,11 @@ #include // for Object, 
ObjectProperties #include -#include -#include // IWYU pragma: keep -#include // for arg, init, class_, module_, str_attr_accessor, PYBIND11_MODULE, pybind11 -#include // IWYU pragma: keep -#include // IWYU pragma: keep +#include // for nlohmann::detail::out_of_range +#include // IWYU pragma: keep +#include // for arg, init, class_, module_, str_attr_accessor, PYBIND11_MODULE, pybind11 +#include // IWYU pragma: keep +#include // IWYU pragma: keep #include // for JSONValues #include // for pymrc::import diff --git a/morpheus/_lib/llm/src/py_llm_node.cpp b/morpheus/_lib/llm/src/py_llm_node.cpp index 2124d8f817..8daa839ac5 100644 --- a/morpheus/_lib/llm/src/py_llm_node.cpp +++ b/morpheus/_lib/llm/src/py_llm_node.cpp @@ -19,7 +19,6 @@ #include "morpheus/llm/input_map.hpp" #include "morpheus/llm/llm_engine.hpp" -#include "morpheus/llm/llm_node_base.hpp" #include // IWYU pragma: keep #include diff --git a/morpheus/_lib/llm/src/py_llm_node_base.cpp b/morpheus/_lib/llm/src/py_llm_node_base.cpp index aea311ad9b..8d42dd5388 100644 --- a/morpheus/_lib/llm/src/py_llm_node_base.cpp +++ b/morpheus/_lib/llm/src/py_llm_node_base.cpp @@ -17,7 +17,7 @@ #include "py_llm_node_base.hpp" -#include "morpheus/llm/llm_context.hpp" +#include "morpheus/llm/llm_context.hpp" // IWYU pragma: keep #include "morpheus/llm/llm_engine.hpp" #include "morpheus/llm/llm_node.hpp" #include "morpheus/llm/llm_node_base.hpp" diff --git a/morpheus/_lib/messages/__init__.pyi b/morpheus/_lib/messages/__init__.pyi index 67077ef8eb..f94113fa7b 100644 --- a/morpheus/_lib/messages/__init__.pyi +++ b/morpheus/_lib/messages/__init__.pyi @@ -184,10 +184,19 @@ class MessageMeta(): def copy_dataframe(self) -> object: ... def ensure_sliceable_index(self) -> typing.Optional[str]: ... def get_column_names(self) -> typing.List[str]: ... + @typing.overload + def get_data(self) -> object: ... + @typing.overload + def get_data(self, columns: None) -> object: ... 
+ @typing.overload + def get_data(self, columns: str) -> object: ... + @typing.overload + def get_data(self, columns: typing.List[str]) -> object: ... def has_sliceable_index(self) -> bool: ... @staticmethod def make_from_file(arg0: str) -> MessageMeta: ... def mutable_dataframe(self) -> MutableTableCtxMgr: ... + def set_data(self, arg0: object, arg1: object) -> None: ... @property def count(self) -> int: """ diff --git a/morpheus/_lib/messages/module.cpp b/morpheus/_lib/messages/module.cpp index 453d691082..7132e2192f 100644 --- a/morpheus/_lib/messages/module.cpp +++ b/morpheus/_lib/messages/module.cpp @@ -229,6 +229,22 @@ PYBIND11_MODULE(messages, _module) .def(py::init<>(&MessageMetaInterfaceProxy::init_python), py::arg("df")) .def_property_readonly("count", &MessageMetaInterfaceProxy::count) .def_property_readonly("df", &MessageMetaInterfaceProxy::df_property, py::return_value_policy::move) + .def("get_data", + py::overload_cast(&MessageMetaInterfaceProxy::get_data), + py::return_value_policy::move) + .def("get_data", + py::overload_cast(&MessageMetaInterfaceProxy::get_data), + py::return_value_policy::move, + py::arg("columns")) + .def("get_data", + py::overload_cast>(&MessageMetaInterfaceProxy::get_data), + py::return_value_policy::move, + py::arg("columns")) + .def("get_data", + py::overload_cast(&MessageMetaInterfaceProxy::get_data), + py::return_value_policy::move, + py::arg("columns")) + .def("set_data", &MessageMetaInterfaceProxy::set_data, py::return_value_policy::move) .def("get_column_names", &MessageMetaInterfaceProxy::get_column_names) .def("copy_dataframe", &MessageMetaInterfaceProxy::get_data_frame, py::return_value_policy::move) .def("mutable_dataframe", &MessageMetaInterfaceProxy::mutable_dataframe, py::return_value_policy::move) diff --git a/morpheus/_lib/modules/module.cpp b/morpheus/_lib/modules/module.cpp index db64342c74..1c7dc4811c 100644 --- a/morpheus/_lib/modules/module.cpp +++ b/morpheus/_lib/modules/module.cpp @@ -20,12 +20,9 @@ 
#include "morpheus/version.hpp" #include -#include -#include // for object_api::operator(), object::cast #include // for arg, init, class_, module_, str_attr_accessor, PYBIND11_MODULE, pybind11 #include -#include // for array #include #include diff --git a/morpheus/_lib/src/io/data_loader_registry.cpp b/morpheus/_lib/src/io/data_loader_registry.cpp index 1655631d7c..c9a61d6e52 100644 --- a/morpheus/_lib/src/io/data_loader_registry.cpp +++ b/morpheus/_lib/src/io/data_loader_registry.cpp @@ -24,14 +24,12 @@ #include #include -#include -#include +#include // for gil_scoped_acquire #include #include -#include #include -#include +#include // for move namespace morpheus { template class FactoryRegistry; diff --git a/morpheus/_lib/src/io/loaders/file.cpp b/morpheus/_lib/src/io/loaders/file.cpp index b60a0a06d8..dab17a20e6 100644 --- a/morpheus/_lib/src/io/loaders/file.cpp +++ b/morpheus/_lib/src/io/loaders/file.cpp @@ -22,14 +22,12 @@ #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -131,4 +129,4 @@ std::shared_ptr FileDataLoader::load(std::shared_ptrpayload(MessageMeta::create_from_python(std::move(dataframe))); return message; } -} // namespace morpheus \ No newline at end of file +} // namespace morpheus diff --git a/morpheus/_lib/src/io/loaders/rest.cpp b/morpheus/_lib/src/io/loaders/rest.cpp index 76d2c87ada..4e2bfe0b00 100644 --- a/morpheus/_lib/src/io/loaders/rest.cpp +++ b/morpheus/_lib/src/io/loaders/rest.cpp @@ -21,16 +21,15 @@ #include "morpheus/messages/meta.hpp" #include -#include +#include #include -#include #include #include -#include #include #include #include -#include +#include +#include #include #include #include @@ -43,18 +42,18 @@ #include #include #include -#include -#include +#include +#include +#include +#include #include #include -#include #include #include #include #include #include -#include #include #include #include diff --git a/morpheus/_lib/src/io/serializers.cpp 
b/morpheus/_lib/src/io/serializers.cpp index 4c31cf0b7f..54234f1592 100644 --- a/morpheus/_lib/src/io/serializers.cpp +++ b/morpheus/_lib/src/io/serializers.cpp @@ -28,11 +28,9 @@ #include #include #include -#include +#include #include #include // IWYU pragma: keep -#include -#include #include // for size_t #include diff --git a/morpheus/_lib/src/llm/input_map.cpp b/morpheus/_lib/src/llm/input_map.cpp index 9c5e27c921..4f27a3235d 100644 --- a/morpheus/_lib/src/llm/input_map.cpp +++ b/morpheus/_lib/src/llm/input_map.cpp @@ -20,16 +20,9 @@ #include "morpheus/llm/llm_node_runner.hpp" #include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include namespace morpheus::llm { diff --git a/morpheus/_lib/src/llm/llm_node_runner.cpp b/morpheus/_lib/src/llm/llm_node_runner.cpp index 4f6a335ab5..3624eb0b34 100644 --- a/morpheus/_lib/src/llm/llm_node_runner.cpp +++ b/morpheus/_lib/src/llm/llm_node_runner.cpp @@ -18,6 +18,7 @@ #include "morpheus/llm/llm_node_runner.hpp" #include "morpheus/llm/llm_context.hpp" +#include "morpheus/llm/llm_node_base.hpp" #include "morpheus/llm/utils.hpp" #include "morpheus/utilities/string_util.hpp" diff --git a/morpheus/_lib/src/llm/utils.cpp b/morpheus/_lib/src/llm/utils.cpp index a10fb63f4c..8addc5c4a8 100644 --- a/morpheus/_lib/src/llm/utils.cpp +++ b/morpheus/_lib/src/llm/utils.cpp @@ -18,7 +18,6 @@ #include "morpheus/llm/utils.hpp" #include "morpheus/llm/input_map.hpp" -#include "morpheus/llm/llm_node_runner.hpp" #include "morpheus/utilities/string_util.hpp" #include @@ -26,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/morpheus/_lib/src/messages/control.cpp b/morpheus/_lib/src/messages/control.cpp index dd54b80a43..0edece274d 100644 --- a/morpheus/_lib/src/messages/control.cpp +++ b/morpheus/_lib/src/messages/control.cpp @@ -25,7 +25,6 @@ #include #include -#include #include #include #include @@ -33,6 +32,7 @@ #include namespace py = pybind11; +using namespace 
py::literals; namespace morpheus { diff --git a/morpheus/_lib/src/messages/memory/inference_memory.cpp b/morpheus/_lib/src/messages/memory/inference_memory.cpp index 2ad969642a..cc434ed9ca 100644 --- a/morpheus/_lib/src/messages/memory/inference_memory.cpp +++ b/morpheus/_lib/src/messages/memory/inference_memory.cpp @@ -17,11 +17,10 @@ #include "morpheus/messages/memory/inference_memory.hpp" -// for TensorObject #include "morpheus/objects/tensor_object.hpp" // IWYU pragma: keep #include "morpheus/utilities/cupy_util.hpp" // for CupyUtil::cupy_to_tensors, CupyUtil::py_tensor_map_t -#include +#include #include // IWYU pragma: keep #include diff --git a/morpheus/_lib/src/messages/memory/response_memory.cpp b/morpheus/_lib/src/messages/memory/response_memory.cpp index 7d2512e5d3..2949b7c8f0 100644 --- a/morpheus/_lib/src/messages/memory/response_memory.cpp +++ b/morpheus/_lib/src/messages/memory/response_memory.cpp @@ -19,7 +19,7 @@ #include "morpheus/utilities/cupy_util.hpp" -#include +#include #include // IWYU pragma: keep #include diff --git a/morpheus/_lib/src/messages/memory/tensor_memory.cpp b/morpheus/_lib/src/messages/memory/tensor_memory.cpp index 4f1c734516..f3da72e487 100644 --- a/morpheus/_lib/src/messages/memory/tensor_memory.cpp +++ b/morpheus/_lib/src/messages/memory/tensor_memory.cpp @@ -22,7 +22,7 @@ #include "morpheus/utilities/stage_util.hpp" #include "morpheus/utilities/string_util.hpp" // for MORPHEUS_CONCAT_STR -#include +#include #include // for attribute_error, key_error #include // IWYU pragma: keep diff --git a/morpheus/_lib/src/messages/meta.cpp b/morpheus/_lib/src/messages/meta.cpp index eedce67439..dfb8dfbd47 100644 --- a/morpheus/_lib/src/messages/meta.cpp +++ b/morpheus/_lib/src/messages/meta.cpp @@ -18,23 +18,32 @@ #include "morpheus/messages/meta.hpp" #include "morpheus/io/deserializers.hpp" +#include "morpheus/objects/dtype.hpp" // for DType #include "morpheus/objects/mutable_table_ctx_mgr.hpp" #include 
"morpheus/objects/python_data_table.hpp" #include "morpheus/objects/table_info.hpp" +#include "morpheus/objects/tensor_object.hpp" #include "morpheus/utilities/cudf_util.hpp" +#include // for cudaMemcpy, cudaMemcpy2D, cudaMemcpyKind +#include // for column_view #include +#include // for type_id, data_type, size_type #include +#include // for __check_cuda_errors, MRC_CHECK_CUDA #include #include #include #include // for PyExc_DeprecationWarning #include // for PyErr_WarnEx +#include // for size_t +#include // for uint8_t #include #include #include // for operator<< needed by glog #include // for runtime_error +#include // for make_tuple, tuple #include // We're already including pybind11.h and don't need to include cast. // For some reason IWYU also thinks we need array for the `isinsance` call. @@ -44,6 +53,7 @@ namespace morpheus { namespace py = pybind11; +using namespace py::literals; /****** Component public implementations *******************/ /****** MessageMeta ****************************************/ @@ -58,6 +68,77 @@ TableInfo MessageMeta::get_info() const return this->m_data->get_info(); } +TableInfo MessageMeta::get_info(const std::string& col_name) const +{ + auto full_info = this->m_data->get_info(); + + return full_info.get_slice(0, full_info.num_rows(), {col_name}); +} + +TableInfo MessageMeta::get_info(const std::vector& column_names) const +{ + auto full_info = this->m_data->get_info(); + + return full_info.get_slice(0, full_info.num_rows(), column_names); +} + +void MessageMeta::set_data(const std::string& col_name, TensorObject tensor) +{ + this->set_data({col_name}, {tensor}); +} + +void MessageMeta::set_data(const std::vector& column_names, const std::vector& tensors) +{ + CHECK_EQ(column_names.size(), tensors.size()) << "Column names and tensors must be the same size"; + + TableInfo table_meta; + try + { + table_meta = this->get_info(column_names); + } catch (const std::runtime_error& e) + { + std::ostringstream err_msg; + err_msg << 
e.what() << " Ensure that the stage that needs this column has populated the '_needed_columns' " + << "attribute and that at least one stage in the current segment is using the PreallocatorMixin to " + << "ensure all needed columns have been allocated."; + throw std::runtime_error(err_msg.str()); + } + + for (std::size_t i = 0; i < tensors.size(); ++i) + { + const auto& cv = table_meta.get_column(i); + const auto table_type_id = cv.type().id(); + const auto tensor_type = DType(tensors[i].dtype()); + const auto tensor_type_id = tensor_type.cudf_type_id(); + const auto row_stride = tensors[i].stride(0); + + CHECK(tensors[i].count() == cv.size() && + (table_type_id == tensor_type_id || + (table_type_id == cudf::type_id::BOOL8 && tensor_type_id == cudf::type_id::UINT8))); + + const auto item_size = tensors[i].dtype().item_size(); + + // Dont use cv.data<>() here since that does not account for the size of each element + auto data_start = const_cast(cv.head()) + cv.offset() * item_size; + + if (row_stride == 1) + { + // column major just use cudaMemcpy + MRC_CHECK_CUDA(cudaMemcpy(data_start, tensors[i].data(), tensors[i].bytes(), cudaMemcpyDeviceToDevice)); + } + else + { + MRC_CHECK_CUDA(cudaMemcpy2D(data_start, + item_size, + tensors[i].data(), + row_stride * item_size, + item_size, + cv.size(), + cudaMemcpyDeviceToDevice)); + } + } +} + MutableTableInfo MessageMeta::get_mutable_info() const { return this->m_data->get_mutable_info(); @@ -180,6 +261,145 @@ TensorIndex MessageMetaInterfaceProxy::count(MessageMeta& self) return self.count(); } +pybind11::object MessageMetaInterfaceProxy::get_data(MessageMeta& self) +{ + // Need to release the GIL before calling `get_meta()` + pybind11::gil_scoped_release no_gil; + + // Get the column and convert to cudf + auto info = self.get_info(); + + // Convert to a python datatable. 
Automatically gets the GIL + return CudfHelper::table_from_table_info(info); +} + +pybind11::object MessageMetaInterfaceProxy::get_data(MessageMeta& self, std::string col_name) +{ + TableInfo info; + + { + // Need to release the GIL before calling `get_meta()` + pybind11::gil_scoped_release no_gil; + + // Get the column and convert to cudf + info = self.get_info(col_name); + } + + auto py_table = CudfHelper::table_from_table_info(info); + + // Now convert it to a series by selecting only the column + return py_table[col_name.c_str()]; +} + +pybind11::object MessageMetaInterfaceProxy::get_data(MessageMeta& self, std::vector columns) +{ + // Need to release the GIL before calling `get_meta()` + pybind11::gil_scoped_release no_gil; + + // Get the column and convert to cudf + auto info = self.get_info(columns); + + // Convert to a python datatable. Automatically gets the GIL + return CudfHelper::table_from_table_info(info); +} + +pybind11::object MessageMetaInterfaceProxy::get_data(MessageMeta& self, pybind11::none none_obj) +{ + // Just offload to the overload without columns. 
This overload is needed to match the python interface + return MessageMetaInterfaceProxy::get_data(self); +} + +std::tuple get_indexers(MessageMeta& self, + py::object df, + py::object columns, + cudf::size_type num_rows) +{ + auto row_indexer = pybind11::slice(pybind11::int_(0), pybind11::int_(num_rows), pybind11::none()); + + if (columns.is_none()) + { + columns = df.attr("columns").attr("to_list")(); + } + else if (pybind11::isinstance(columns)) + { + // Convert a single string into a list so all versions return tables, not series + pybind11::list col_list; + + col_list.append(columns); + + columns = std::move(col_list); + } + + auto column_indexer = df.attr("columns").attr("get_indexer_for")(columns); + + return std::make_tuple(row_indexer, column_indexer); +} + +void MessageMetaInterfaceProxy::set_data(MessageMeta& self, pybind11::object columns, pybind11::object value) +{ + // Need to release the GIL before calling `get_meta()` + pybind11::gil_scoped_release no_gil; + + auto mutable_info = self.get_mutable_info(); + auto num_rows = mutable_info.num_rows(); + + // Need the GIL for the remainder + pybind11::gil_scoped_acquire gil; + + auto pdf = mutable_info.checkout_obj(); + auto& df = *pdf; + + auto [row_indexer, column_indexer] = get_indexers(self, df, columns, num_rows); + + // Check to see if this is adding a column. If so, we need to use .loc instead of .iloc + if (column_indexer.contains(-1)) + { + // cudf is really bad at adding new columns. 
Need to use loc with a unique and monotonic index + py::object saved_index = df.attr("index"); + + // Check to see if we can use slices + if (!(saved_index.attr("is_unique").cast() && (saved_index.attr("is_monotonic_increasing").cast() || + saved_index.attr("is_monotonic_decreasing").cast()))) + { + df.attr("reset_index")("drop"_a = true, "inplace"_a = true); + } + else + { + // Erase the saved index so we dont reset it + saved_index = py::none(); + } + + // Perform the update via slices + df.attr("loc")[pybind11::make_tuple(df.attr("index")[row_indexer], columns)] = value; + + // Reset the index if we changed it + if (!saved_index.is_none()) + { + df.attr("set_index")(saved_index, "inplace"_a = true); + } + } + else + { + // If we only have one column, convert it to a series (broadcasts work with more types on a series) + if (pybind11::len(column_indexer) == 1) + { + column_indexer = column_indexer.cast()[0]; + } + + try + { + // Use iloc + df.attr("iloc")[pybind11::make_tuple(row_indexer, column_indexer)] = value; + } catch (py::error_already_set) + { + // Try this as a fallback. Works better for strings. 
See issue #286 + df[columns].attr("iloc")[row_indexer] = value; + } + } + + mutable_info.return_obj(std::move(pdf)); +} + std::vector MessageMetaInterfaceProxy::get_column_names(MessageMeta& self) { pybind11::gil_scoped_release no_gil; diff --git a/morpheus/_lib/src/modules/data_loader_module.cpp b/morpheus/_lib/src/modules/data_loader_module.cpp index 5a4bb37cdc..2abf1edda8 100644 --- a/morpheus/_lib/src/modules/data_loader_module.cpp +++ b/morpheus/_lib/src/modules/data_loader_module.cpp @@ -17,8 +17,6 @@ #include "morpheus/modules/data_loader_module.hpp" -#include "rxcpp/operators/rx-map.hpp" - #include "morpheus/io/data_loader_registry.hpp" #include "morpheus/messages/control.hpp" @@ -26,18 +24,17 @@ #include #include #include +#include #include #include #include // IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" -#include #include #include #include #include #include -#include using namespace mrc::modules; using nlohmann::json; diff --git a/morpheus/_lib/src/objects/memory_descriptor.cpp b/morpheus/_lib/src/objects/memory_descriptor.cpp index dabc0a7132..3329bee6bc 100644 --- a/morpheus/_lib/src/objects/memory_descriptor.cpp +++ b/morpheus/_lib/src/objects/memory_descriptor.cpp @@ -18,7 +18,6 @@ #include "morpheus/objects/memory_descriptor.hpp" #include -#include // for get_current_device_resource #include // for move diff --git a/morpheus/_lib/src/objects/python_data_table.cpp b/morpheus/_lib/src/objects/python_data_table.cpp index a6063ebf7f..478aa1f284 100644 --- a/morpheus/_lib/src/objects/python_data_table.cpp +++ b/morpheus/_lib/src/objects/python_data_table.cpp @@ -20,11 +20,9 @@ #include "morpheus/utilities/cudf_util.hpp" #include -#include // for object::cast #include #include -#include #include namespace morpheus { diff --git a/morpheus/_lib/src/objects/wrapped_tensor.cpp b/morpheus/_lib/src/objects/wrapped_tensor.cpp index c1ca21ed2e..b593cc6c97 100644 --- a/morpheus/_lib/src/objects/wrapped_tensor.cpp +++ 
b/morpheus/_lib/src/objects/wrapped_tensor.cpp @@ -18,13 +18,11 @@ #include "morpheus/objects/wrapped_tensor.hpp" #include "morpheus/objects/tensor_object.hpp" // for TensorObject -#include "morpheus/types.hpp" // for ShapeType #include "morpheus/utilities/cupy_util.hpp" -#include +#include #include -#include // needed for make_tuple #include // for uintptr_t #include #include // get_shape & get_stride return vectors diff --git a/morpheus/_lib/src/stages/add_classification.cpp b/morpheus/_lib/src/stages/add_classification.cpp index 4ea37432f3..7bdb5e2eec 100644 --- a/morpheus/_lib/src/stages/add_classification.cpp +++ b/morpheus/_lib/src/stages/add_classification.cpp @@ -20,6 +20,8 @@ #include "mrc/segment/builder.hpp" #include "mrc/segment/object.hpp" +#include "morpheus/messages/control.hpp" + #include #include #include @@ -31,18 +33,32 @@ namespace morpheus { // Component public implementations // ************ AddClassificationStage **************************** // -AddClassificationsStage::AddClassificationsStage(std::map idx2label, float threshold) : - AddScoresStageBase(std::move(idx2label), threshold) +template +AddClassificationsStage::AddClassificationsStage(std::map idx2label, + float threshold) : + AddScoresStageBase(std::move(idx2label), threshold) {} +template class AddClassificationsStage; +template class AddClassificationsStage; + // ************ AddClassificationStageInterfaceProxy ************* // -std::shared_ptr> AddClassificationStageInterfaceProxy::init( +std::shared_ptr> AddClassificationStageInterfaceProxy::init_multi( + mrc::segment::Builder& builder, + const std::string& name, + std::map idx2label, + float threshold) +{ + return builder.construct_object(name, idx2label, threshold); +} + +std::shared_ptr> AddClassificationStageInterfaceProxy::init_cm( mrc::segment::Builder& builder, const std::string& name, std::map idx2label, float threshold) { - return builder.construct_object(name, idx2label, threshold); + return 
builder.construct_object(name, idx2label, threshold); } } // namespace morpheus diff --git a/morpheus/_lib/src/stages/add_scores.cpp b/morpheus/_lib/src/stages/add_scores.cpp index dba722ee55..bd5eb69b19 100644 --- a/morpheus/_lib/src/stages/add_scores.cpp +++ b/morpheus/_lib/src/stages/add_scores.cpp @@ -20,6 +20,7 @@ #include "mrc/segment/builder.hpp" #include "mrc/segment/object.hpp" +#include "morpheus/messages/control.hpp" #include "morpheus/stages/add_scores_stage_base.hpp" #include // for size_t @@ -34,15 +35,25 @@ namespace morpheus { // Component public implementations // ************ AddScoresStage **************************** // -AddScoresStage::AddScoresStage(std::map idx2label) : - AddScoresStageBase(std::move(idx2label), std::nullopt) +template +AddScoresStage::AddScoresStage(std::map idx2label) : + AddScoresStageBase(std::move(idx2label), std::nullopt) {} +template class AddScoresStage; +template class AddScoresStage; + // ************ AddScoresStageInterfaceProxy ************* // -std::shared_ptr> AddScoresStageInterfaceProxy::init( +std::shared_ptr> AddScoresStageInterfaceProxy::init_multi( + mrc::segment::Builder& builder, const std::string& name, std::map idx2label) +{ + return builder.construct_object(name, std::move(idx2label)); +} + +std::shared_ptr> AddScoresStageInterfaceProxy::init_cm( mrc::segment::Builder& builder, const std::string& name, std::map idx2label) { - return builder.construct_object(name, std::move(idx2label)); + return builder.construct_object(name, std::move(idx2label)); } } // namespace morpheus diff --git a/morpheus/_lib/src/stages/add_scores_stage_base.cpp b/morpheus/_lib/src/stages/add_scores_stage_base.cpp index cb69f8e6b8..b7ff58ca67 100644 --- a/morpheus/_lib/src/stages/add_scores_stage_base.cpp +++ b/morpheus/_lib/src/stages/add_scores_stage_base.cpp @@ -17,30 +17,29 @@ #include "morpheus/stages/add_scores_stage_base.hpp" -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" -#include 
"mrc/node/sink_properties.hpp" -#include "mrc/node/source_properties.hpp" -#include "mrc/types.hpp" -#include "pymrc/node.hpp" -#include "rxcpp/operators/rx-map.hpp" - -#include "morpheus/objects/dtype.hpp" // for DType -#include "morpheus/objects/tensor.hpp" -#include "morpheus/objects/tensor_object.hpp" // for TensorObject -#include "morpheus/types.hpp" // for TensorIndex -#include "morpheus/utilities/matx_util.hpp" -#include "morpheus/utilities/string_util.hpp" -#include "morpheus/utilities/tensor_util.hpp" // for TensorUtils::get_element_stride - -#include -#include - -#include -#include -#include -#include // needed for logging -#include // for move +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" +#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/objects/dtype.hpp" // for DType +#include "morpheus/objects/tensor.hpp" // for Tensor +#include "morpheus/objects/tensor_object.hpp" // for TensorObject +#include "morpheus/types.hpp" // for TensorIndex +#include "morpheus/utilities/matx_util.hpp" // for MatxUtil +#include "morpheus/utilities/string_util.hpp" // for StringUtil +#include "morpheus/utilities/tensor_util.hpp" // for TensorUtils + +#include // for CHECK, COMPACT_GOOGLE_LOG_FATAL, LogMessageFatal, COMP... 
+#include // for observable_member, trace_activity, decay_t, operator| + +#include // for size_t +#include // for reverse_iterator +#include // for shared_ptr, allocator, __shared_ptr_access +#include // for basic_ostream, operator<<, basic_ostream::operator<< +#include // for runtime_error +#include // for is_same_v +#include // for type_info +#include // for move, pair +#include // for vector // IWYU thinks we need __alloc_traits<>::value_type for vector assignments // IWYU pragma: no_include // IWYU pragma: no_include @@ -49,18 +48,46 @@ namespace morpheus { // Component public implementations // ************ AddClassificationStage **************************** // -AddScoresStageBase::AddScoresStageBase(std::map idx2label, std::optional threshold) : - PythonNode(), +template +AddScoresStageBase::AddScoresStageBase(std::map idx2label, + std::optional threshold) : + base_t(), m_idx2label(std::move(idx2label)), m_threshold(threshold), m_min_col_count(m_idx2label.rbegin()->first) // Ordered map's largest key will be the last entry { - this->pipe(rxcpp::operators::map([this](sink_type_t x) { return this->on_data(std::move(x)); })); + this->pipe(rxcpp::operators::map([this](sink_type_t x) { + return this->on_data(std::move(x)); + })); } -AddScoresStageBase::source_type_t AddScoresStageBase::on_data(sink_type_t x) +template +AddScoresStageBase::source_type_t AddScoresStageBase::on_data(sink_type_t x) { - auto probs = x->get_probs_tensor(); + if constexpr (std::is_same_v>) + { + this->on_multi_response_message(x); + } + else if constexpr (std::is_same_v>) + { + this->on_control_message(x); + } + // sink_type_t not supported + else + { + std::string error_msg{"AddScoresStageBase receives unsupported input type: " + std::string(typeid(x).name())}; + LOG(ERROR) << error_msg; + throw std::runtime_error(error_msg); + } + return x; +} + +template <> +void AddScoresStageBase::on_multi_response_message( + std::shared_ptr x) +{ + auto probs = x->get_probs_tensor(); + const auto& 
shape = probs.get_shape(); // Depending on the input the stride is given in bytes or elements, convert to elements @@ -104,8 +131,59 @@ AddScoresStageBase::source_type_t AddScoresStageBase::on_data(sink_type_t x) } x->set_meta(columns, tensors); +} - return x; +template <> +void AddScoresStageBase::on_control_message(std::shared_ptr x) +{ + // The default of probs_tensor_name is "probs" + auto probs = x->tensors()->get_tensor("probs"); + const auto& shape = probs.get_shape(); + + // Depending on the input the stride is given in bytes or elements, convert to elements + auto stride = TensorUtils::get_element_stride(probs.get_stride()); + + CHECK(shape.size() == 2 && shape[1] > m_min_col_count) + << "Model output did not contain enough columns to fufill the requested labels. Label " + "indexes: " + << StringUtil::map_to_str(m_idx2label.begin(), m_idx2label.end()) << ", Model output columns: " << shape[1]; + + const auto num_rows = shape[0]; + const auto num_columns = shape[1]; + + TensorObject output_tensor; + + if (m_threshold.has_value()) + { + auto thresh_bool_buffer = MatxUtil::threshold( + {probs.data(), probs.dtype(), probs.get_memory(), probs.get_shape(), probs.get_stride()}, + *m_threshold, + false); + + output_tensor.swap(Tensor::create(thresh_bool_buffer, DType::create(), shape, stride)); + } + else + { + output_tensor.swap(std::move(probs)); + } + + std::vector columns; + std::vector tensors; + + std::size_t i = 0; + for (const auto& [column_num, column_name] : m_idx2label) + { + columns.push_back(column_name); + tensors.emplace_back(output_tensor.slice({0, static_cast(column_num)}, + {num_rows, static_cast(column_num + 1)})); + + ++i; + } + + x->payload()->set_data(columns, tensors); } +template class AddScoresStageBase; +template class AddScoresStageBase; + } // namespace morpheus diff --git a/morpheus/_lib/src/stages/filter_detection.cpp b/morpheus/_lib/src/stages/filter_detection.cpp index 8cad99f82d..199d716e5b 100644 --- 
a/morpheus/_lib/src/stages/filter_detection.cpp +++ b/morpheus/_lib/src/stages/filter_detection.cpp @@ -17,13 +17,8 @@ #include "morpheus/stages/filter_detection.hpp" // IWYU pragma: accosiated -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" -#include "mrc/node/sink_properties.hpp" -#include "mrc/node/source_properties.hpp" #include "mrc/segment/builder.hpp" #include "mrc/segment/object.hpp" -#include "mrc/types.hpp" #include "pymrc/node.hpp" #include "morpheus/messages/multi_tensor.hpp" @@ -53,6 +48,7 @@ #include // needed for glog #include #include // for pair +#include // IWYU thinks we need ext/new_allocator.h for size_t for some reason // IWYU pragma: no_include diff --git a/morpheus/_lib/src/stages/http_server_source_stage.cpp b/morpheus/_lib/src/stages/http_server_source_stage.cpp index b520497171..65cc0968f8 100644 --- a/morpheus/_lib/src/stages/http_server_source_stage.cpp +++ b/morpheus/_lib/src/stages/http_server_source_stage.cpp @@ -22,13 +22,12 @@ #include // for json_reader_options & read_json #include // for CHECK & LOG -#include // for std::exception -#include // for function -#include // needed by GLOG -#include // for std::runtime_error -#include // for std::this_thread::sleep_for -#include // for make_tuple -#include // for std::move +#include // for std::exception +#include // needed by GLOG +#include // for std::runtime_error +#include // for std::this_thread::sleep_for +#include // for make_tuple +#include // for std::move // IWYU thinks we need more boost headers than we need as int_to_status is defined in status.hpp // IWYU pragma: no_include diff --git a/morpheus/_lib/src/stages/kafka_source.cpp b/morpheus/_lib/src/stages/kafka_source.cpp index a26b01ebe3..1bb6ea369d 100644 --- a/morpheus/_lib/src/stages/kafka_source.cpp +++ b/morpheus/_lib/src/stages/kafka_source.cpp @@ -17,9 +17,6 @@ #include "morpheus/stages/kafka_source.hpp" -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" 
-#include "mrc/node/source_properties.hpp" #include "mrc/segment/object.hpp" #include "pymrc/utilities/function_wrappers.hpp" // for PyFuncWrapper @@ -36,7 +33,7 @@ #include #include // for SharedFuture #include -#include +#include #include #include @@ -46,8 +43,7 @@ #include #include #include -#include // for initializer_list -#include // for back_insert_iterator, back_inserter +#include // for back_insert_iterator, back_inserter #include #include #include @@ -210,8 +206,12 @@ void KafkaSourceStage__Rebalancer::rebalance_cb(RdKafka::KafkaConsumer* consumer std::vector current_assignment; CHECK_KAFKA(consumer->assignment(current_assignment), RdKafka::ERR_NO_ERROR, "Error retrieving current assignment"); - auto old_partition_ids = foreach_map(current_assignment, [](const auto& x) { return x->partition(); }); - auto new_partition_ids = foreach_map(partitions, [](const auto& x) { return x->partition(); }); + auto old_partition_ids = foreach_map(current_assignment, [](const auto& x) { + return x->partition(); + }); + auto new_partition_ids = foreach_map(partitions, [](const auto& x) { + return x->partition(); + }); if (err == RdKafka::ERR__ASSIGN_PARTITIONS) { @@ -334,8 +334,12 @@ KafkaSourceStage::subscriber_fn_t KafkaSourceStage::build() std::size_t records_emitted = 0; // Build rebalancer KafkaSourceStage__Rebalancer rebalancer( - [this]() { return this->batch_timeout_ms(); }, - [this]() { return this->max_batch_size(); }, + [this]() { + return this->batch_timeout_ms(); + }, + [this]() { + return this->max_batch_size(); + }, [this](const std::string str_to_display) { auto& ctx = mrc::runnable::Context::get_runtime_context(); return MORPHEUS_CONCAT_STR(ctx.info() << " " << str_to_display); @@ -552,8 +556,9 @@ std::unique_ptr KafkaSourceStage::create_consumer(RdKafk auto const& parts = *(topic->partitions()); - std::transform( - parts.cbegin(), parts.cend(), std::back_inserter(part_ids), [](auto const& part) { return part->id(); }); + std::transform(parts.cbegin(), 
parts.cend(), std::back_inserter(part_ids), [](auto const& part) { + return part->id(); + }); auto toppar_list = foreach_map(parts, [&topic](const auto& part) { return std::unique_ptr{ @@ -561,20 +566,24 @@ std::unique_ptr KafkaSourceStage::create_consumer(RdKafk }); std::vector toppar_ptrs = - foreach_map(toppar_list, [](const std::unique_ptr& x) { return x.get(); }); + foreach_map(toppar_list, [](const std::unique_ptr& x) { + return x.get(); + }); // Query Kafka to populate the TopicPartitions with the desired offsets CHECK_KAFKA( consumer->committed(toppar_ptrs, 2000), RdKafka::ERR_NO_ERROR, "Failed retrieve Kafka committed offsets"); - auto committed = - foreach_map(toppar_list, [](const std::unique_ptr& x) { return x->offset(); }); + auto committed = foreach_map(toppar_list, [](const std::unique_ptr& x) { + return x->offset(); + }); // Query Kafka to populate the TopicPartitions with the desired offsets CHECK_KAFKA(consumer->position(toppar_ptrs), RdKafka::ERR_NO_ERROR, "Failed retrieve Kafka positions"); - auto positions = - foreach_map(toppar_list, [](const std::unique_ptr& x) { return x->offset(); }); + auto positions = foreach_map(toppar_list, [](const std::unique_ptr& x) { + return x->offset(); + }); auto watermarks = foreach_map(toppar_list, [&consumer](const std::unique_ptr& x) { int64_t low; diff --git a/morpheus/_lib/src/stages/preprocess_fil.cpp b/morpheus/_lib/src/stages/preprocess_fil.cpp index 293a3af70c..978e7557eb 100644 --- a/morpheus/_lib/src/stages/preprocess_fil.cpp +++ b/morpheus/_lib/src/stages/preprocess_fil.cpp @@ -17,192 +17,297 @@ #include "morpheus/stages/preprocess_fil.hpp" -#include "mrc/segment/object.hpp" - -#include "morpheus/messages/memory/inference_memory_fil.hpp" -#include "morpheus/messages/meta.hpp" // for MessageMeta -#include "morpheus/objects/dev_mem_info.hpp" // for DevMemInfo -#include "morpheus/objects/dtype.hpp" -#include "morpheus/objects/table_info.hpp" // for TableInfo -#include "morpheus/objects/tensor.hpp" 
-#include "morpheus/objects/tensor_object.hpp" // for TensorObject -#include "morpheus/types.hpp" // for TensorIndex -#include "morpheus/utilities/matx_util.hpp" - -#include // for cudaMemcpy, cudaMemcpyDeviceToDevice -#include // for column, column::contents +#include "mrc/segment/object.hpp" // for Object + +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/inference_memory_fil.hpp" // for InferenceMemoryFIL +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/messages/multi.hpp" // for MultiMessage +#include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage +#include "morpheus/objects/dev_mem_info.hpp" // for DevMemInfo +#include "morpheus/objects/dtype.hpp" // for DType, TypeId +#include "morpheus/objects/table_info.hpp" // for TableInfo, MutableTableInfo +#include "morpheus/objects/tensor.hpp" // for Tensor +#include "morpheus/objects/tensor_object.hpp" // for TensorObject +#include "morpheus/types.hpp" // for TensorIndex +#include "morpheus/utilities/matx_util.hpp" // for MatxUtil + +#include // for cudaMemcpy, cudaMemcpyKind +#include // for column #include // for column_view -#include -#include -#include // for MRC_CHECK_CUDA -#include -#include -#include // for str_attr_accessor, arg -#include -#include -#include // for cuda_stream_per_thread -#include // for device_buffer - -#include // for std::find -#include -#include -#include -#include +#include // for type_id, data_type +#include // for cast +#include // for COMPACT_GOOGLE_LOG_ERROR, LOG, LogMessage +#include // for __check_cuda_errors, MRC_CHECK_CUDA +#include // for Builder +#include // for gil_scoped_acquire +#include // for object_api::operator(), operator""_a, arg +#include // for object, str, object_api, generic_item, literals +#include // for cuda_stream_per_thread +#include // for device_buffer + +#include // for find +#include 
// for size_t +#include // for shared_ptr, __shared_ptr_access, allocator, mak... +#include // for runtime_error +#include // for is_same_v +#include // for type_info +#include // for move namespace morpheus { // Component public implementations // ************ PreprocessFILStage ************************* // -PreprocessFILStage::PreprocessFILStage(const std::vector& features) : - PythonNode(base_t::op_factory_from_sub_fn(build_operator())), +template +PreprocessFILStage::PreprocessFILStage(const std::vector& features) : + base_t(rxcpp::operators::map([this](sink_type_t x) { + return this->on_data(std::move(x)); + })), m_fea_cols(std::move(features)) {} -PreprocessFILStage::subscribe_fn_t PreprocessFILStage::build_operator() +template +void PreprocessFILStage::transform_bad_columns(std::vector& fea_cols, + morpheus::MutableTableInfo& mutable_info) { - return [this](rxcpp::observable input, rxcpp::subscriber output) { - return input.subscribe(rxcpp::make_observer( - [&output, this](sink_type_t x) { - // Make sure to - auto df_meta = this->fix_bad_columns(x); - - auto packed_data = std::make_shared( - m_fea_cols.size() * x->mess_count * sizeof(float), rmm::cuda_stream_per_thread); - - for (size_t i = 0; i < df_meta.num_columns(); ++i) - { - auto curr_col = df_meta.get_column(i); - - auto curr_ptr = static_cast(packed_data->data()) + i * df_meta.num_rows(); - - // Check if we are something other than float - if (curr_col.type().id() != cudf::type_id::FLOAT32) - { - auto float_data = cudf::cast(curr_col, cudf::data_type(cudf::type_id::FLOAT32))->release(); - - // Do the copy here before it goes out of scope - MRC_CHECK_CUDA(cudaMemcpy(curr_ptr, - float_data.data->data(), - df_meta.num_rows() * sizeof(float), - cudaMemcpyDeviceToDevice)); - } - else - { - MRC_CHECK_CUDA(cudaMemcpy(curr_ptr, - curr_col.data(), - df_meta.num_rows() * sizeof(float), - cudaMemcpyDeviceToDevice)); - } - } - - // Need to convert from row major to column major - // Easiest way to do this is to 
transpose the data from [fea_len, row_count] to [row_count, fea_len] - auto transposed_data = - MatxUtil::transpose(DevMemInfo{packed_data, - TypeId::FLOAT32, - {static_cast(m_fea_cols.size()), x->mess_count}, - {x->mess_count, 1}}); - - // Create the tensor which will be row-major and size [row_count, fea_len] - auto input__0 = Tensor::create(transposed_data, - DType::create(), - {x->mess_count, static_cast(m_fea_cols.size())}, - {}, - 0); - - auto seq_id_dtype = DType::create(); - auto seq_ids = Tensor::create(MatxUtil::create_seq_ids(x->mess_count, - m_fea_cols.size(), - seq_id_dtype.type_id(), - input__0.get_memory(), - x->mess_offset), - seq_id_dtype, - {x->mess_count, 3}, - {}, - 0); - - // Build the results - auto memory = - std::make_shared(x->mess_count, std::move(input__0), std::move(seq_ids)); - - auto next = std::make_shared( - x->meta, x->mess_offset, x->mess_count, std::move(memory), 0, memory->count); - - output.on_next(std::move(next)); - }, - [&](std::exception_ptr error_ptr) { - output.on_error(error_ptr); - }, - [&]() { - output.on_completed(); - })); - }; + auto df_meta_col_names = mutable_info.get_column_names(); + std::vector bad_cols; + // Only check the feature columns. Leave the rest unchanged + for (auto& fea_col : fea_cols) + { + // Find the index of the column in the dataframe + auto col_idx = + std::find(df_meta_col_names.begin(), df_meta_col_names.end(), fea_col) - df_meta_col_names.begin(); + + if (col_idx == df_meta_col_names.size()) + { + // This feature was not found. Ignore it. + continue; + } + + if (mutable_info.get_column(col_idx).type().id() == cudf::type_id::STRING) + { + bad_cols.push_back(fea_col); + } + } + + // Exit early if there is nothing to do + if (!bad_cols.empty()) + { + // Need to ensure all string columns have been converted to numbers. This requires running a + // regex which is too difficult to do from C++ at this time. So grab the GIL, make the + // conversions, and release. 
This is horribly inefficient, but so is the JSON lines format for + // this workflow + using namespace pybind11::literals; + pybind11::gil_scoped_acquire gil; + + // pybind11::object df = x->meta->get_py_table(); + auto pdf = mutable_info.checkout_obj(); + auto& df = *pdf; + + std::string regex = R"((\d+))"; + + for (auto c : bad_cols) + { + df[pybind11::str(c)] = df[pybind11::str(c)] + .attr("str") + .attr("extract")(pybind11::str(regex), "expand"_a = true) + .attr("astype")(pybind11::str("float32")); + } + + mutable_info.return_obj(std::move(pdf)); + } } -TableInfo PreprocessFILStage::fix_bad_columns(sink_type_t x) +template +TableInfo PreprocessFILStage::fix_bad_columns(sink_type_t x) { - std::vector bad_cols; + if constexpr (std::is_same_v>) + { + { + // Get the mutable info for the entire meta object so we only do this once per dataframe + auto mutable_info = x->meta->get_mutable_info(); + transform_bad_columns(this->m_fea_cols, mutable_info); + } + // Now re-get the meta + return x->get_meta(m_fea_cols); + } + else if constexpr (std::is_same_v>) { - // Get the mutable info for the entire meta object so we only do this once per dataframe - auto mutable_info = x->meta->get_mutable_info(); - auto df_meta_col_names = mutable_info.get_column_names(); + { + // Get the mutable info for the entire meta object so we only do this once per dataframe + auto mutable_info = x->payload()->get_mutable_info(); + transform_bad_columns(this->m_fea_cols, mutable_info); + } - // Only check the feature columns. 
Leave the rest unchanged - for (auto& fea_col : m_fea_cols) + // Now re-get the meta + return x->payload()->get_info(m_fea_cols); + } + // sink_type_t not supported + else + { + std::string error_msg{"PreProcessFILStage receives unsupported input type: " + std::string(typeid(x).name())}; + LOG(ERROR) << error_msg; + throw std::runtime_error(error_msg); + } +} + +template +PreprocessFILStage::source_type_t PreprocessFILStage::on_data(sink_type_t x) +{ + if constexpr (std::is_same_v>) + { + return on_multi_message(x); + } + else if constexpr (std::is_same_v>) + { + return on_control_message(x); + } + // sink_type_t not supported + else + { + std::string error_msg{"PreProcessFILStage receives unsupported input type: " + std::string(typeid(x).name())}; + LOG(ERROR) << error_msg; + throw std::runtime_error(error_msg); + } +} + +template <> +std::shared_ptr PreprocessFILStage::on_multi_message( + std::shared_ptr x) +{ + auto packed_data = std::make_shared(m_fea_cols.size() * x->mess_count * sizeof(float), + rmm::cuda_stream_per_thread); + auto df_meta = this->fix_bad_columns(x); + for (size_t i = 0; i < df_meta.num_columns(); ++i) + { + auto curr_col = df_meta.get_column(i); + + auto curr_ptr = static_cast(packed_data->data()) + i * df_meta.num_rows(); + + // Check if we are something other than float + if (curr_col.type().id() != cudf::type_id::FLOAT32) { - // Find the index of the column in the dataframe - auto col_idx = - std::find(df_meta_col_names.begin(), df_meta_col_names.end(), fea_col) - df_meta_col_names.begin(); - - if (col_idx == df_meta_col_names.size()) - { - // This feature was not found. Ignore it. 
- continue; - } - - if (mutable_info.get_column(col_idx).type().id() == cudf::type_id::STRING) - { - bad_cols.push_back(fea_col); - } + auto float_data = cudf::cast(curr_col, cudf::data_type(cudf::type_id::FLOAT32))->release(); + + // Do the copy here before it goes out of scope + MRC_CHECK_CUDA(cudaMemcpy( + curr_ptr, float_data.data->data(), df_meta.num_rows() * sizeof(float), cudaMemcpyDeviceToDevice)); } + else + { + MRC_CHECK_CUDA(cudaMemcpy(curr_ptr, + curr_col.template data(), + df_meta.num_rows() * sizeof(float), + cudaMemcpyDeviceToDevice)); + } + } + + // Need to convert from row major to column major + // Easiest way to do this is to transpose the data from [fea_len, row_count] to [row_count, fea_len] + auto transposed_data = MatxUtil::transpose(DevMemInfo{packed_data, + TypeId::FLOAT32, + {static_cast(m_fea_cols.size()), x->mess_count}, + {x->mess_count, 1}}); + + // Create the tensor which will be row-major and size [row_count, fea_len] + auto input__0 = Tensor::create( + transposed_data, DType::create(), {x->mess_count, static_cast(m_fea_cols.size())}, {}, 0); + + auto seq_id_dtype = DType::create(); + auto seq_ids = Tensor::create( + MatxUtil::create_seq_ids( + x->mess_count, m_fea_cols.size(), seq_id_dtype.type_id(), input__0.get_memory(), x->mess_offset), + seq_id_dtype, + {x->mess_count, 3}, + {}, + 0); + + // Build the results + auto memory = std::make_shared(x->mess_count, std::move(input__0), std::move(seq_ids)); + + auto next = std::make_shared( + x->meta, x->mess_offset, x->mess_count, std::move(memory), 0, memory->count); + + return next; +} - // Exit early if there is nothing to do - if (!bad_cols.empty()) +template <> +std::shared_ptr PreprocessFILStage::on_control_message( + std::shared_ptr x) +{ + auto num_rows = x->payload()->get_info().num_rows(); + auto packed_data = + std::make_shared(m_fea_cols.size() * num_rows * sizeof(float), rmm::cuda_stream_per_thread); + auto df_meta = this->fix_bad_columns(x); + for (size_t i = 0; i < 
df_meta.num_columns(); ++i) + { + auto curr_col = df_meta.get_column(i); + + auto curr_ptr = static_cast(packed_data->data()) + i * df_meta.num_rows(); + + // Check if we are something other than float + if (curr_col.type().id() != cudf::type_id::FLOAT32) { - // Need to ensure all string columns have been converted to numbers. This requires running a - // regex which is too difficult to do from C++ at this time. So grab the GIL, make the - // conversions, and release. This is horribly inefficient, but so is the JSON lines format for - // this workflow - using namespace pybind11::literals; - pybind11::gil_scoped_acquire gil; - - // pybind11::object df = x->meta->get_py_table(); - auto pdf = mutable_info.checkout_obj(); - auto& df = *pdf; - - std::string regex = R"((\d+))"; - - for (auto c : bad_cols) - { - df[pybind11::str(c)] = df[pybind11::str(c)] - .attr("str") - .attr("extract")(pybind11::str(regex), "expand"_a = true) - .attr("astype")(pybind11::str("float32")); - } - - mutable_info.return_obj(std::move(pdf)); + auto float_data = cudf::cast(curr_col, cudf::data_type(cudf::type_id::FLOAT32))->release(); + + // Do the copy here before it goes out of scope + MRC_CHECK_CUDA(cudaMemcpy( + curr_ptr, float_data.data->data(), df_meta.num_rows() * sizeof(float), cudaMemcpyDeviceToDevice)); + } + else + { + MRC_CHECK_CUDA(cudaMemcpy(curr_ptr, + curr_col.template data(), + df_meta.num_rows() * sizeof(float), + cudaMemcpyDeviceToDevice)); } } - // Now re-get the meta - return x->get_meta(m_fea_cols); + // Need to convert from row major to column major + // Easiest way to do this is to transpose the data from [fea_len, row_count] to [row_count, fea_len] + auto transposed_data = MatxUtil::transpose(DevMemInfo{ + packed_data, TypeId::FLOAT32, {static_cast(m_fea_cols.size()), num_rows}, {num_rows, 1}}); + + // Create the tensor which will be row-major and size [row_count, fea_len] + auto input__0 = Tensor::create( + transposed_data, DType::create(), {num_rows, 
static_cast(m_fea_cols.size())}, {}, 0); + + auto seq_id_dtype = DType::create(); + auto seq_ids = Tensor::create( + MatxUtil::create_seq_ids(num_rows, m_fea_cols.size(), seq_id_dtype.type_id(), input__0.get_memory(), 0), + seq_id_dtype, + {num_rows, 3}, + {}, + 0); + + // Build the results + auto memory = std::make_shared(num_rows); + memory->set_tensor("input__0", std::move(input__0)); + memory->set_tensor("seq_ids", std::move(seq_ids)); + auto next = x; + next->tensors(memory); + + return next; } +template class PreprocessFILStage; +template class PreprocessFILStage; + // ************ PreprocessFILStageInterfaceProxy *********** // -std::shared_ptr> PreprocessFILStageInterfaceProxy::init( +std::shared_ptr> PreprocessFILStageInterfaceProxy::init_multi( + mrc::segment::Builder& builder, const std::string& name, const std::vector& features) +{ + auto stage = builder.construct_object(name, features); + + return stage; +} + +std::shared_ptr> PreprocessFILStageInterfaceProxy::init_cm( mrc::segment::Builder& builder, const std::string& name, const std::vector& features) { - auto stage = builder.construct_object(name, features); + auto stage = builder.construct_object(name, features); return stage; } diff --git a/morpheus/_lib/src/stages/preprocess_nlp.cpp b/morpheus/_lib/src/stages/preprocess_nlp.cpp index b82830dd44..75fd794103 100644 --- a/morpheus/_lib/src/stages/preprocess_nlp.cpp +++ b/morpheus/_lib/src/stages/preprocess_nlp.cpp @@ -17,184 +17,295 @@ #include "morpheus/stages/preprocess_nlp.hpp" -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" -#include "mrc/node/sink_properties.hpp" -#include "mrc/node/source_properties.hpp" -#include "mrc/segment/object.hpp" -#include "mrc/types.hpp" +#include "mrc/segment/object.hpp" // for Object +#include "morpheus/messages/control.hpp" // for ControlMessage #include "morpheus/messages/memory/inference_memory.hpp" // for InferenceMemory -#include "morpheus/messages/multi_inference.hpp" -#include 
"morpheus/objects/dev_mem_info.hpp" -#include "morpheus/objects/dtype.hpp" -#include "morpheus/objects/table_info.hpp" // for TableInfo -#include "morpheus/objects/tensor.hpp" -#include "morpheus/types.hpp" // for TensorIndex, TensorMap -#include "morpheus/utilities/matx_util.hpp" - -#include // for column, column::contents -#include -#include -#include -#include -#include -#include // for strings_column_view -#include -#include -#include -#include -#include -#include -#include -#include -#include // for device_buffer - -#include -#include -#include -#include -#include -#include +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" +#include "morpheus/messages/multi.hpp" // for MultiMessage +#include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage +#include "morpheus/objects/dev_mem_info.hpp" // for DevMemInfo +#include "morpheus/objects/dtype.hpp" // for DType +#include "morpheus/objects/table_info.hpp" // for TableInfo +#include "morpheus/objects/tensor.hpp" // for Tensor +#include "morpheus/types.hpp" // for TensorIndex +#include "morpheus/utilities/matx_util.hpp" // for MatxUtil + +#include // for column +#include // for make_column_from_scalar +#include // for column_view +#include // for sequence +#include // for interleave_columns +#include // for numeric_scalar +#include // for strings_column_view +#include // for table_view +#include // for type_id, data_type +#include // for cast +#include // for COMPACT_GOOGLE_LOG_ERROR, LOG, LogMessage +#include // for Builder +#include // for normalize_spaces +#include // for tokenizer_result, load_vocabulary_file, subword_tok... +#include // for cuda_stream_default +#include // for device_buffer +#include // for get_current_device_resource + +#include // for uint32_t, int32_t +#include // for shared_ptr, unique_ptr, __shared_ptr_access, make_s... 
+#include // for runtime_error +#include // for is_same_v +#include // for type_info +#include // for move +#include // for vector namespace morpheus { // Component public implementations // ************ PreprocessNLPStage ************************* // -PreprocessNLPStage::PreprocessNLPStage(std::string vocab_hash_file, - uint32_t sequence_length, - bool truncation, - bool do_lower_case, - bool add_special_token, - int stride, - std::string column) : - PythonNode(base_t::op_factory_from_sub_fn(build_operator())), +template +PreprocessNLPStage::PreprocessNLPStage(std::string vocab_hash_file, + uint32_t sequence_length, + bool truncation, + bool do_lower_case, + bool add_special_token, + int stride, + std::string column) : + base_t(rxcpp::operators::map([this](sink_type_t x) { + return this->on_data(std::move(x)); + })), m_vocab_hash_file(std::move(vocab_hash_file)), m_sequence_length(sequence_length), m_truncation(truncation), m_do_lower_case(do_lower_case), m_add_special_token(add_special_token), - m_stride(stride), m_column(std::move(column)) -{} +{ + // Auto calc stride to be 75% of sequence length + if (stride < 0) + { + stride = m_sequence_length / 2; + stride = stride + stride / 2; + } + + m_stride = stride; +} + +template +PreprocessNLPStage::source_type_t PreprocessNLPStage::on_data(sink_type_t x) +{ + if constexpr (std::is_same_v>) + { + return this->on_multi_message(x); + } + else if constexpr (std::is_same_v>) + { + return this->on_control_message(x); + } + // sink_type_t not supported + else + { + std::string error_msg{"PreProcessNLPStage receives unsupported input type: " + std::string(typeid(x).name())}; + LOG(ERROR) << error_msg; + throw std::runtime_error(error_msg); + } +} + +template <> +std::shared_ptr PreprocessNLPStage::on_multi_message( + std::shared_ptr x) +{ + // Convert to string view + auto meta = x->get_meta(this->m_column); + + auto col = meta.get_column(0); + auto string_col = cudf::strings_column_view{col}; + + auto token_results = 
subword_tokenize(this->m_vocab_hash_file, + this->m_sequence_length, + this->m_do_lower_case, + this->m_truncation, + string_col, + this->m_stride, + rmm::mr::get_current_device_resource()); + + // Build the results + auto memory = std::make_shared(token_results.nrows_tensor); + + TensorIndex length = token_results.tensor_token_ids->size() / token_results.sequence_length; + auto input_ids_released = + cudf::cast(token_results.tensor_token_ids->view(), cudf::data_type(cudf::type_id::INT32))->release(); + + memory->set_tensor("input_ids", + Tensor::create(std::move(input_ids_released.data), + DType::create(), + {length, static_cast(token_results.sequence_length)}, + {}, + 0)); + + length = token_results.tensor_attention_mask->size() / token_results.sequence_length; + auto input_mask_released = + cudf::cast(token_results.tensor_attention_mask->view(), cudf::data_type(cudf::type_id::INT32))->release(); + memory->set_tensor("input_mask", + Tensor::create(std::move(input_mask_released.data), + DType::create(), + {length, static_cast(token_results.sequence_length)}, + {}, + 0)); + + auto tensor_index_dtype = DType::create(); + length = token_results.tensor_metadata->size() / 3; + auto seq_ids_released = + cudf::cast(token_results.tensor_metadata->view(), cudf::data_type(tensor_index_dtype.cudf_type_id())) + ->release(); + + std::shared_ptr seq_ids_data = std::move(seq_ids_released.data); + + if (x->mess_offset > 0) + { + // Add an offset to the seq_ids so the message IDs line up + MatxUtil::offset_seq_ids(DevMemInfo{seq_ids_data, tensor_index_dtype.type_id(), {length, 3}, {1, 3}}, + x->mess_offset); + } + + memory->set_tensor("seq_ids", Tensor::create(seq_ids_data, tensor_index_dtype, {length, 3}, {}, 0)); + + auto next = std::make_shared( + x->meta, x->mess_offset, x->mess_count, std::move(memory), 0, memory->count); + + return std::move(next); +} + +template <> +std::shared_ptr PreprocessNLPStage::on_control_message( + std::shared_ptr x) +{ + // Convert to string view + 
auto meta = x->payload()->get_info(this->m_column); + + auto col = meta.get_column(0); + auto string_col = cudf::strings_column_view{col}; + + auto token_results = subword_tokenize(this->m_vocab_hash_file, + this->m_sequence_length, + this->m_do_lower_case, + this->m_truncation, + string_col, + this->m_stride, + rmm::mr::get_current_device_resource()); + + // Build the results + auto memory = std::make_shared(token_results.nrows_tensor); + + TensorIndex length = token_results.tensor_token_ids->size() / token_results.sequence_length; + auto input_ids_released = + cudf::cast(token_results.tensor_token_ids->view(), cudf::data_type(cudf::type_id::INT32))->release(); + memory->set_tensor("input_ids", + Tensor::create(std::move(input_ids_released.data), + DType::create(), + {length, static_cast(token_results.sequence_length)}, + {}, + 0)); + + length = token_results.tensor_attention_mask->size() / token_results.sequence_length; + auto input_mask_released = + cudf::cast(token_results.tensor_attention_mask->view(), cudf::data_type(cudf::type_id::INT32))->release(); + memory->set_tensor("input_mask", + Tensor::create(std::move(input_mask_released.data), + DType::create(), + {length, static_cast(token_results.sequence_length)}, + {}, + 0)); -PreprocessNLPStage::subscribe_fn_t PreprocessNLPStage::build_operator() + auto tensor_index_dtype = DType::create(); + length = token_results.tensor_metadata->size() / 3; + auto seq_ids_released = + cudf::cast(token_results.tensor_metadata->view(), cudf::data_type(tensor_index_dtype.cudf_type_id())) + ->release(); + + std::shared_ptr seq_ids_data = std::move(seq_ids_released.data); + + memory->set_tensor("seq_ids", Tensor::create(seq_ids_data, tensor_index_dtype, {length, 3}, {}, 0)); + + auto next = x; + next->tensors(memory); + + return std::move(next); +} + +template +nvtext::tokenizer_result PreprocessNLPStage::subword_tokenize( + const std::string& vocab_hash_file, + uint32_t sequence_length, + bool do_lower_case, + bool truncation, 
+ cudf::strings_column_view const& string_col, + int stride, + rmm::mr::device_memory_resource* mr) { - return [this](rxcpp::observable input, rxcpp::subscriber output) { - uint32_t stride = m_stride; - - // Auto calc stride to be 75% of sequence length - if (stride < 0) - { - stride = m_sequence_length / 2; - stride = stride + stride / 2; - } - - return input.subscribe(rxcpp::make_observer( - [this, &output, stride](sink_type_t x) { - // Convert to string view - auto meta = x->get_meta(this->m_column); - auto col = meta.get_column(0); - auto string_col = cudf::strings_column_view{col}; - - // Create the hashed vocab - thread_local std::unique_ptr vocab = - nvtext::load_vocabulary_file(this->m_vocab_hash_file); - - // remove leading and trailing whitespace - auto normalized_col = nvtext::normalize_spaces(string_col); - auto normalized_col_view = cudf::strings_column_view{normalized_col->view()}; - - // Perform the tokenizer - nvtext::tokenizer_result token_results; - - if (normalized_col_view.chars_size(rmm::cuda_stream_default) > 0) - { - token_results = nvtext::subword_tokenize(normalized_col_view, - *vocab, - this->m_sequence_length, - stride, - this->m_do_lower_case, - this->m_truncation, - rmm::mr::get_current_device_resource()); - } - else - { - // workaround for a situation where the input strings contain either no characters or only - // whitespace - auto zero = cudf::numeric_scalar(0, true, rmm::cuda_stream_default); - auto ids = - cudf::make_column_from_scalar(zero, this->m_sequence_length * normalized_col_view.size()); - auto mask = - cudf::make_column_from_scalar(zero, this->m_sequence_length * normalized_col_view.size()); - auto metadata = [&]() { - auto iota = cudf::sequence(normalized_col_view.size(), zero); - auto zeroes = cudf::make_column_from_scalar(zero, normalized_col_view.size()); - return cudf::interleave_columns(cudf::table_view{ - std::vector{iota->view(), zeroes->view(), zeroes->view()}}); - }(); - - token_results = 
nvtext::tokenizer_result{static_cast(normalized_col_view.size()), - this->m_sequence_length, - std::move(ids), - std::move(mask), - std::move(metadata)}; - } - - // Build the results - auto memory = std::make_shared(token_results.nrows_tensor); - - TensorIndex length = token_results.tensor_token_ids->size() / token_results.sequence_length; - auto input_ids_released = - cudf::cast(token_results.tensor_token_ids->view(), cudf::data_type(cudf::type_id::INT32)) - ->release(); - - memory->set_tensor("input_ids", - Tensor::create(std::move(input_ids_released.data), - DType::create(), - {length, static_cast(token_results.sequence_length)}, - {}, - 0)); - - length = token_results.tensor_attention_mask->size() / token_results.sequence_length; - auto input_mask_released = - cudf::cast(token_results.tensor_attention_mask->view(), cudf::data_type(cudf::type_id::INT32)) - ->release(); - memory->set_tensor("input_mask", - Tensor::create(std::move(input_mask_released.data), - DType::create(), - {length, static_cast(token_results.sequence_length)}, - {}, - 0)); - - auto tensor_index_dtype = DType::create(); - length = token_results.tensor_metadata->size() / 3; - auto seq_ids_released = cudf::cast(token_results.tensor_metadata->view(), - cudf::data_type(tensor_index_dtype.cudf_type_id())) - ->release(); - - std::shared_ptr seq_ids_data = std::move(seq_ids_released.data); - - if (x->mess_offset > 0) - { - // Add an offset to the seq_ids so the message IDs line up - MatxUtil::offset_seq_ids( - DevMemInfo{seq_ids_data, tensor_index_dtype.type_id(), {length, 3}, {1, 3}}, x->mess_offset); - } - - memory->set_tensor("seq_ids", Tensor::create(seq_ids_data, tensor_index_dtype, {length, 3}, {}, 0)); - - auto next = std::make_shared( - x->meta, x->mess_offset, x->mess_count, std::move(memory), 0, memory->count); - - output.on_next(std::move(next)); - }, - [&](std::exception_ptr error_ptr) { output.on_error(error_ptr); }, - [&]() { output.on_completed(); })); - }; + // Create the hashed vocab 
+ thread_local std::unique_ptr vocab = nvtext::load_vocabulary_file(vocab_hash_file); + + // remove leading and trailing whitespace + auto normalized_col = nvtext::normalize_spaces(string_col); + auto normalized_col_view = cudf::strings_column_view{normalized_col->view()}; + + // Perform the tokenizer + nvtext::tokenizer_result token_results; + + if (normalized_col_view.chars_size(rmm::cuda_stream_default) > 0) + { + token_results = nvtext::subword_tokenize(normalized_col_view, + *vocab, + sequence_length, + stride, + do_lower_case, + truncation, + rmm::mr::get_current_device_resource()); + } + else + { + // workaround for a situation where the input strings contain either no characters or only + // whitespace + auto zero = cudf::numeric_scalar(0, true, rmm::cuda_stream_default); + auto ids = cudf::make_column_from_scalar(zero, sequence_length * normalized_col_view.size()); + auto mask = cudf::make_column_from_scalar(zero, sequence_length * normalized_col_view.size()); + auto metadata = [&]() { + auto iota = cudf::sequence(normalized_col_view.size(), zero); + auto zeroes = cudf::make_column_from_scalar(zero, normalized_col_view.size()); + return cudf::interleave_columns( + cudf::table_view{std::vector{iota->view(), zeroes->view(), zeroes->view()}}); + }(); + + token_results = nvtext::tokenizer_result{static_cast(normalized_col_view.size()), + sequence_length, + std::move(ids), + std::move(mask), + std::move(metadata)}; + } + return token_results; } +template class PreprocessNLPStage; +template class PreprocessNLPStage; + // ************ PreprocessNLPStageInterfaceProxy *********** // -std::shared_ptr> PreprocessNLPStageInterfaceProxy::init( +std::shared_ptr> PreprocessNLPStageInterfaceProxy::init_multi( + mrc::segment::Builder& builder, + const std::string& name, + std::string vocab_hash_file, + uint32_t sequence_length, + bool truncation, + bool do_lower_case, + bool add_special_token, + int stride, + std::string column) +{ + auto stage = builder.construct_object( 
+ name, vocab_hash_file, sequence_length, truncation, do_lower_case, add_special_token, stride, column); + + return stage; +} + +std::shared_ptr> PreprocessNLPStageInterfaceProxy::init_cm( mrc::segment::Builder& builder, const std::string& name, std::string vocab_hash_file, @@ -205,7 +316,7 @@ std::shared_ptr> PreprocessNLPStageInte int stride, std::string column) { - auto stage = builder.construct_object( + auto stage = builder.construct_object( name, vocab_hash_file, sequence_length, truncation, do_lower_case, add_special_token, stride, column); return stage; diff --git a/morpheus/_lib/src/stages/serialize.cpp b/morpheus/_lib/src/stages/serialize.cpp index b725b2fde6..fb612cd0b0 100644 --- a/morpheus/_lib/src/stages/serialize.cpp +++ b/morpheus/_lib/src/stages/serialize.cpp @@ -17,23 +17,18 @@ #include "morpheus/stages/serialize.hpp" -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" -#include "mrc/node/sink_properties.hpp" -#include "mrc/node/source_properties.hpp" #include "mrc/segment/builder.hpp" #include "mrc/segment/object.hpp" -#include "mrc/types.hpp" -#include "pymrc/node.hpp" #include "morpheus/messages/meta.hpp" -#include "morpheus/objects/table_info.hpp" +#include "morpheus/objects/table_info.hpp" // for TableInfo #include -#include #include #include -#include // for move +#include // for is_same_v +#include // for move + // IWYU thinks basic_stringbuf & map are needed for the regex constructor // IWYU pragma: no_include // IWYU pragma: no_include @@ -43,27 +38,29 @@ namespace morpheus { constexpr std::regex_constants::syntax_option_type RegexOptions = std::regex_constants::ECMAScript | std::regex_constants::icase; -// Component public implementations -// ************ WriteToFileStage **************************** // -SerializeStage::SerializeStage(const std::vector& include, - const std::vector& exclude, - bool fixed_columns) : - PythonNode(base_t::op_factory_from_sub_fn(build_operator())), +template 
+SerializeStage::SerializeStage(const std::vector& include, + const std::vector& exclude, + bool fixed_columns) : + base_t(base_t::op_factory_from_sub_fn(build_operator())), m_fixed_columns{fixed_columns} { make_regex_objs(include, m_include); make_regex_objs(exclude, m_exclude); } -void SerializeStage::make_regex_objs(const std::vector& regex_strs, std::vector& regex_objs) +template +void SerializeStage::make_regex_objs(const std::vector& regex_strs, + std::vector& regex_objs) { for (const auto& s : regex_strs) { - regex_objs.emplace_back(std::regex{s, RegexOptions}); + regex_objs.emplace_back(s, RegexOptions); } } -bool SerializeStage::match_column(const std::vector& patterns, const std::string& column) const +template +bool SerializeStage::match_column(const std::vector& patterns, const std::string& column) const { for (const auto& re : patterns) { @@ -75,7 +72,8 @@ bool SerializeStage::match_column(const std::vector& patterns, const return false; } -bool SerializeStage::include_column(const std::string& column) const +template +bool SerializeStage::include_column(const std::string& column) const { if (m_include.empty()) { @@ -87,12 +85,14 @@ bool SerializeStage::include_column(const std::string& column) const } } -bool SerializeStage::exclude_column(const std::string& column) const +template +bool SerializeStage::exclude_column(const std::string& column) const { return match_column(m_exclude, column); } -std::shared_ptr SerializeStage::get_meta(sink_type_t& msg) +template +std::shared_ptr SerializeStage::get_meta(sink_type_t& msg) { // If none of the columns match the include regex patterns or are all are excluded this has the effect // of including all of the rows since calling msg->get_meta({}) will return a view with all columns. 
@@ -100,7 +100,19 @@ std::shared_ptr SerializeStage::get_meta(sink_type_t& msg) if (!m_fixed_columns || m_column_names.empty()) { m_column_names.clear(); - for (const auto& c : msg->get_meta().get_column_names()) + + std::vector column_names; + + if constexpr (std::is_same_v) + { + column_names = msg->get_meta().get_column_names(); + } + else + { + column_names = msg->payload()->get_info().get_column_names(); + } + + for (const auto& c : column_names) { if (include_column(c) && !exclude_column(c)) { @@ -109,11 +121,19 @@ std::shared_ptr SerializeStage::get_meta(sink_type_t& msg) } } - return std::make_shared( - msg->meta, msg->mess_offset, msg->mess_offset + msg->mess_count, m_column_names); + if constexpr (std::is_same_v) + { + return std::make_shared( + msg->meta, msg->mess_offset, msg->mess_offset + msg->mess_count, m_column_names); + } + else + { + return std::make_shared(msg->payload(), 0, msg->payload()->count(), m_column_names); + } } -SerializeStage::subscribe_fn_t SerializeStage::build_operator() +template +SerializeStage::subscribe_fn_t SerializeStage::build_operator() { return [this](rxcpp::observable input, rxcpp::subscriber output) { return input.subscribe(rxcpp::make_observer( @@ -122,21 +142,41 @@ SerializeStage::subscribe_fn_t SerializeStage::build_operator() output.on_next(std::move(next_meta)); }, - [&](std::exception_ptr error_ptr) { output.on_error(error_ptr); }, - [&]() { output.on_completed(); })); + [&](std::exception_ptr error_ptr) { + output.on_error(error_ptr); + }, + [&]() { + output.on_completed(); + })); }; } -// ************ WriteToFileStageInterfaceProxy ************* // -std::shared_ptr> SerializeStageInterfaceProxy::init( +template class SerializeStage; +template class SerializeStage; + +// ************ SerializeStageInterfaceProxy ************* // +std::shared_ptr> SerializeStageInterfaceProxy::init_mm( mrc::segment::Builder& builder, const std::string& name, const std::vector& include, const std::vector& exclude, bool 
fixed_columns) { - auto stage = builder.construct_object(name, include, exclude, fixed_columns); + auto stage = builder.construct_object(name, include, exclude, fixed_columns); return stage; } + +std::shared_ptr> SerializeStageInterfaceProxy::init_cm( + mrc::segment::Builder& builder, + const std::string& name, + const std::vector& include, + const std::vector& exclude, + bool fixed_columns) +{ + auto stage = builder.construct_object(name, include, exclude, fixed_columns); + + return stage; +} + } // namespace morpheus diff --git a/morpheus/_lib/src/stages/write_to_file.cpp b/morpheus/_lib/src/stages/write_to_file.cpp index ea125b8c50..327c09df8b 100644 --- a/morpheus/_lib/src/stages/write_to_file.cpp +++ b/morpheus/_lib/src/stages/write_to_file.cpp @@ -15,15 +15,10 @@ * limitations under the License. */ -#include "morpheus/stages/write_to_file.hpp" // IWYU pragma: accosiated +#include "morpheus/stages/write_to_file.hpp" // IWYU pragma: associated -#include "mrc/node/rx_sink_base.hpp" -#include "mrc/node/rx_source_base.hpp" -#include "mrc/node/sink_properties.hpp" -#include "mrc/node/source_properties.hpp" #include "mrc/segment/builder.hpp" #include "mrc/segment/object.hpp" -#include "mrc/types.hpp" #include "pymrc/node.hpp" #include "morpheus/io/serializers.hpp" @@ -55,15 +50,21 @@ WriteToFileStage::WriteToFileStage( switch (file_type) { case FileTypes::JSON: { - m_write_func = [this](auto&& PH1) { write_json(std::forward(PH1)); }; + m_write_func = [this](auto&& PH1) { + write_json(std::forward(PH1)); + }; break; } case FileTypes::CSV: { - m_write_func = [this](auto&& PH1) { write_csv(std::forward(PH1)); }; + m_write_func = [this](auto&& PH1) { + write_csv(std::forward(PH1)); + }; break; } case FileTypes::PARQUET: { - m_write_func = [this](auto&& PH1) { write_parquet(std::forward(PH1)); }; + m_write_func = [this](auto&& PH1) { + write_parquet(std::forward(PH1)); + }; break; } case FileTypes::Auto: diff --git a/morpheus/_lib/src/utilities/http_server.cpp 
b/morpheus/_lib/src/utilities/http_server.cpp index 71479a802a..a6a58fd69c 100644 --- a/morpheus/_lib/src/utilities/http_server.cpp +++ b/morpheus/_lib/src/utilities/http_server.cpp @@ -21,36 +21,35 @@ #include "pymrc/utilities/function_wrappers.hpp" // for PyFuncWrapper -#include // for dispatch, make_address -#include // for basic_socket_acceptor<>::executor_type -#include // for basic_stream_socket -#include // for any_executor -#include // for acceptor, endpoint, socket, +#include // for dispatch, make_address +#include +#include // for basic_socket_acceptor<>::executor_type +#include // for basic_stream_socket +#include +#include // for acceptor, endpoint, socket, #include // for socket_base::reuse_address, socket_base, socket_base::max_listen_connections #include // for strand, make_strand, operator== #include // for bind_front_handler, error_code, flat_buffer, tcp_stream -#include // for basic_stream<>::socket_type #include // for bind_front_handler #include // for error_code #include // for flat_buffer -#include // for string_view -#include // for tcp_stream -#include // for read_async, request, response, verb, write_async -#include // for error, error::end_of_stream -#include // for field, field::content_type -#include // for message, response, request -#include // for request_parser, parser -#include // for status, status::not_found -#include // for string_body, basic_string_body, basic_string_body<>::value_type -#include // for verb, operator<<, verb::unknown -#include // for basic_string_view, operator<<, operator== -#include // for CHECK and LOG -#include // for cast +#include +#include // for tcp_stream +#include // for read_async, request, response, verb, write_async +#include // for error, error::end_of_stream +#include // for field, field::content_type +#include +#include // for message, response, request +#include // for request_parser, parser +#include // for status, status::not_found +#include // for string_body, basic_string_body, 
basic_string_body<>::value_type +#include // for verb, operator<<, verb::unknown +#include +#include // for CHECK and LOG #include #include // IWYU pragma: keep #include -#include // for array (indirectly used by the wrapped python callback function) #include // for exception #include // needed for glog #include // for runtime_error, length_error diff --git a/morpheus/_lib/src/utilities/matx_util.cu b/morpheus/_lib/src/utilities/matx_util.cu index 7f45fae162..a1dc626242 100644 --- a/morpheus/_lib/src/utilities/matx_util.cu +++ b/morpheus/_lib/src/utilities/matx_util.cu @@ -105,10 +105,12 @@ struct MatxUtil__MatxCreateSegIds auto output_tensor = matx::make_tensor(static_cast(output_data), shape); auto col0 = output_tensor.template Slice<1>({0, 0}, {matx::matxEnd, matx::matxDropDim}); + auto col1 = output_tensor.template Slice<1>({0, 1}, {matx::matxEnd, matx::matxDropDim}); auto col2 = output_tensor.template Slice<1>({0, 2}, {matx::matxEnd, matx::matxDropDim}); auto range_col = matx::range<0, tensorShape_1d, OutputT>({element_count}, start_idx, 1); (col0 = range_col).run(stream.value()); + (col1 = 0).run(stream.value()); (col2 = fea_len - 1).run(stream.value()); } }; diff --git a/morpheus/_lib/src/utilities/tensor_util.cpp b/morpheus/_lib/src/utilities/tensor_util.cpp index 08b25698c5..ef5b5dce97 100644 --- a/morpheus/_lib/src/utilities/tensor_util.cpp +++ b/morpheus/_lib/src/utilities/tensor_util.cpp @@ -24,7 +24,6 @@ #include // for make_ostream_joiner #include // for operator<<, ostream, stringstream #include // for char_traits, string -#include // for decay_t #include // for vector namespace morpheus { diff --git a/morpheus/_lib/stages/__init__.pyi b/morpheus/_lib/stages/__init__.pyi index 8f2addc910..515bab0c12 100644 --- a/morpheus/_lib/stages/__init__.pyi +++ b/morpheus/_lib/stages/__init__.pyi @@ -14,8 +14,10 @@ import mrc.core.segment import os __all__ = [ - "AddClassificationsStage", - "AddScoresStage", + "AddClassificationsControlMessageStage", + 
"AddClassificationsMultiResponseMessageStage", + "AddScoresControlMessageStage", + "AddScoresMultiResponseMessageStage", "DeserializeControlMessageStage", "DeserializeMultiMessageStage", "FileSourceStage", @@ -26,17 +28,26 @@ __all__ = [ "KafkaSourceStage", "PreallocateMessageMetaStage", "PreallocateMultiMessageStage", - "PreprocessFILStage", - "PreprocessNLPStage", - "SerializeStage", + "PreprocessFILControlMessageStage", + "PreprocessFILMultiMessageStage", + "PreprocessNLPControlMessageStage", + "PreprocessNLPMultiMessageStage", + "SerializeControlMessageStage", + "SerializeMultiMessageStage", "WriteToFileStage" ] -class AddClassificationsStage(mrc.core.segment.SegmentObject): +class AddClassificationsControlMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, idx2label: typing.Dict[int, str], threshold: float) -> None: ... pass -class AddScoresStage(mrc.core.segment.SegmentObject): +class AddClassificationsMultiResponseMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, idx2label: typing.Dict[int, str], threshold: float) -> None: ... + pass +class AddScoresControlMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, idx2label: typing.Dict[int, str]) -> None: ... + pass +class AddScoresMultiResponseMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, idx2label: typing.Dict[int, str]) -> None: ... pass class DeserializeControlMessageStage(mrc.core.segment.SegmentObject): @@ -72,13 +83,22 @@ class PreallocateMessageMetaStage(mrc.core.segment.SegmentObject): class PreallocateMultiMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, needed_columns: typing.List[typing.Tuple[str, morpheus._lib.common.TypeId]]) -> None: ... 
pass -class PreprocessFILStage(mrc.core.segment.SegmentObject): +class PreprocessFILControlMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, features: typing.List[str]) -> None: ... + pass +class PreprocessFILMultiMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, features: typing.List[str]) -> None: ... pass -class PreprocessNLPStage(mrc.core.segment.SegmentObject): +class PreprocessNLPControlMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, vocab_hash_file: str, sequence_length: int, truncation: bool, do_lower_case: bool, add_special_token: bool, stride: int, column: str) -> None: ... + pass +class PreprocessNLPMultiMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, vocab_hash_file: str, sequence_length: int, truncation: bool, do_lower_case: bool, add_special_token: bool, stride: int, column: str) -> None: ... pass -class SerializeStage(mrc.core.segment.SegmentObject): +class SerializeControlMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, include: typing.List[str], exclude: typing.List[str], fixed_columns: bool = True) -> None: ... + pass +class SerializeMultiMessageStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, include: typing.List[str], exclude: typing.List[str], fixed_columns: bool = True) -> None: ... pass class WriteToFileStage(mrc.core.segment.SegmentObject): diff --git a/morpheus/_lib/stages/module.cpp b/morpheus/_lib/stages/module.cpp index 738e534e9a..7b0d7ea293 100644 --- a/morpheus/_lib/stages/module.cpp +++ b/morpheus/_lib/stages/module.cpp @@ -15,10 +15,10 @@ * limitations under the License. 
*/ -#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/control.hpp" #include "morpheus/messages/meta.hpp" #include "morpheus/messages/multi.hpp" -#include "morpheus/objects/file_types.hpp" // for FileTypes +#include "morpheus/objects/file_types.hpp" #include "morpheus/stages/add_classification.hpp" #include "morpheus/stages/add_scores.hpp" #include "morpheus/stages/deserialize.hpp" @@ -33,10 +33,10 @@ #include "morpheus/stages/serialize.hpp" #include "morpheus/stages/write_to_file.hpp" #include "morpheus/utilities/cudf_util.hpp" -#include "morpheus/utilities/http_server.hpp" // for DefaultMaxPayloadSize +#include "morpheus/utilities/http_server.hpp" #include "morpheus/version.hpp" -#include // for Builder +#include #include #include #include // for multiple_inheritance @@ -46,7 +46,7 @@ #include // for pymrc::import #include -#include // for std::filesystem::path +#include #include #include #include @@ -72,22 +72,43 @@ PYBIND11_MODULE(stages, _module) mrc::pymrc::from_import(_module, "morpheus._lib.common", "FilterSource"); - py::class_, + py::class_, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "AddClassificationsStage", py::multiple_inheritance()) - .def(py::init<>(&AddClassificationStageInterfaceProxy::init), + std::shared_ptr>>( + _module, "AddClassificationsMultiResponseMessageStage", py::multiple_inheritance()) + .def(py::init<>(&AddClassificationStageInterfaceProxy::init_multi), py::arg("builder"), py::arg("name"), py::arg("idx2label"), py::arg("threshold")); - py::class_, + py::class_, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "AddScoresStage", py::multiple_inheritance()) - .def( - py::init<>(&AddScoresStageInterfaceProxy::init), py::arg("builder"), py::arg("name"), py::arg("idx2label")); + std::shared_ptr>>( + _module, "AddClassificationsControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&AddClassificationStageInterfaceProxy::init_cm), + py::arg("builder"), + 
py::arg("name"), + py::arg("idx2label"), + py::arg("threshold")); + + py::class_, + mrc::segment::ObjectProperties, + std::shared_ptr>>( + _module, "AddScoresMultiResponseMessageStage", py::multiple_inheritance()) + .def(py::init<>(&AddScoresStageInterfaceProxy::init_multi), + py::arg("builder"), + py::arg("name"), + py::arg("idx2label")); + + py::class_, + mrc::segment::ObjectProperties, + std::shared_ptr>>( + _module, "AddScoresControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&AddScoresStageInterfaceProxy::init_cm), + py::arg("builder"), + py::arg("name"), + py::arg("idx2label")); py::class_>, mrc::segment::ObjectProperties, @@ -203,20 +224,44 @@ PYBIND11_MODULE(stages, _module) py::arg("name"), py::arg("needed_columns")); - py::class_, + py::class_, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "PreprocessFILStage", py::multiple_inheritance()) - .def(py::init<>(&PreprocessFILStageInterfaceProxy::init), + std::shared_ptr>>( + _module, "PreprocessFILMultiMessageStage", py::multiple_inheritance()) + .def(py::init<>(&PreprocessFILStageInterfaceProxy::init_multi), py::arg("builder"), py::arg("name"), py::arg("features")); - py::class_, + py::class_, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "PreprocessNLPStage", py::multiple_inheritance()) - .def(py::init<>(&PreprocessNLPStageInterfaceProxy::init), + std::shared_ptr>>( + _module, "PreprocessFILControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&PreprocessFILStageInterfaceProxy::init_cm), + py::arg("builder"), + py::arg("name"), + py::arg("features")); + + py::class_, + mrc::segment::ObjectProperties, + std::shared_ptr>>( + _module, "PreprocessNLPMultiMessageStage", py::multiple_inheritance()) + .def(py::init<>(&PreprocessNLPStageInterfaceProxy::init_multi), + py::arg("builder"), + py::arg("name"), + py::arg("vocab_hash_file"), + py::arg("sequence_length"), + py::arg("truncation"), + py::arg("do_lower_case"), + py::arg("add_special_token"), + 
py::arg("stride"), + py::arg("column")); + + py::class_, + mrc::segment::ObjectProperties, + std::shared_ptr>>( + _module, "PreprocessNLPControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&PreprocessNLPStageInterfaceProxy::init_cm), py::arg("builder"), py::arg("name"), py::arg("vocab_hash_file"), @@ -248,11 +293,22 @@ PYBIND11_MODULE(stages, _module) py::arg("lines") = false, py::arg("stop_after") = 0); - py::class_, + py::class_, + mrc::segment::ObjectProperties, + std::shared_ptr>>( + _module, "SerializeMultiMessageStage", py::multiple_inheritance()) + .def(py::init<>(&SerializeStageInterfaceProxy::init_mm), + py::arg("builder"), + py::arg("name"), + py::arg("include"), + py::arg("exclude"), + py::arg("fixed_columns") = true); + + py::class_, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "SerializeStage", py::multiple_inheritance()) - .def(py::init<>(&SerializeStageInterfaceProxy::init), + std::shared_ptr>>( + _module, "SerializeControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&SerializeStageInterfaceProxy::init_cm), py::arg("builder"), py::arg("name"), py::arg("include"), diff --git a/morpheus/_lib/tests/CMakeLists.txt b/morpheus/_lib/tests/CMakeLists.txt index b8330fb8bc..7e71bd2eb1 100644 --- a/morpheus/_lib/tests/CMakeLists.txt +++ b/morpheus/_lib/tests/CMakeLists.txt @@ -44,6 +44,7 @@ set_target_properties(test_cuda add_library( morpheus_test_utilities test_utils/common.cpp + test_utils/tensor_utils.cpp ) target_link_libraries( @@ -53,6 +54,12 @@ target_link_libraries( morpheus ) +# Ensure that the python executable is defined for the tests +target_compile_definitions(morpheus_test_utilities + PRIVATE + PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" +) + # Morpheus Tests function(add_morpheus_test) set(options) @@ -148,6 +155,15 @@ add_morpheus_test( test_multi_slices.cpp ) +add_morpheus_test( + NAME stages + FILES + stages/test_preprocess_nlp.cpp + stages/test_preprocess_fil.cpp + stages/test_add_scores.cpp + 
stages/test_add_classification.cpp +) + add_morpheus_test( NAME tensor FILES diff --git a/morpheus/_lib/tests/io/test_data_loader.cpp b/morpheus/_lib/tests/io/test_data_loader.cpp index 9d4df87a7c..c60dae5508 100644 --- a/morpheus/_lib/tests/io/test_data_loader.cpp +++ b/morpheus/_lib/tests/io/test_data_loader.cpp @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/morpheus/_lib/tests/io/test_data_loader_registry.cpp b/morpheus/_lib/tests/io/test_data_loader_registry.cpp index de11170270..bd8c0256e7 100644 --- a/morpheus/_lib/tests/io/test_data_loader_registry.cpp +++ b/morpheus/_lib/tests/io/test_data_loader_registry.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include @@ -55,12 +55,15 @@ TEST_F(TestDataLoaderRegistry, LoaderRegistryRegisterLoaderTest) // Should be able to overwrite an existing loader if we request it EXPECT_NO_THROW(LoaderRegistry::register_factory_fn( "LoaderRegistryRegisterLoaderTest", - [](nlohmann::json config) { return std::make_unique(config); }, + [](nlohmann::json config) { + return std::make_unique(config); + }, false)); - EXPECT_THROW(LoaderRegistry::register_factory_fn( - "LoaderRegistryRegisterLoaderTest", - [](nlohmann::json config) { return std::make_unique(config); }), + EXPECT_THROW(LoaderRegistry::register_factory_fn("LoaderRegistryRegisterLoaderTest", + [](nlohmann::json config) { + return std::make_unique(config); + }), std::runtime_error); } diff --git a/morpheus/_lib/tests/io/test_loaders.cpp b/morpheus/_lib/tests/io/test_loaders.cpp index f526c54cdb..67ae9a1c87 100644 --- a/morpheus/_lib/tests/io/test_loaders.cpp +++ b/morpheus/_lib/tests/io/test_loaders.cpp @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include diff --git a/morpheus/_lib/tests/llm/test_llm_task.cpp b/morpheus/_lib/tests/llm/test_llm_task.cpp index 9ec4575109..7f609995f4 100644 --- a/morpheus/_lib/tests/llm/test_llm_task.cpp +++ b/morpheus/_lib/tests/llm/test_llm_task.cpp @@ -20,6 
+20,7 @@ #include "morpheus/llm/llm_task.hpp" #include +#include using namespace morpheus; using namespace morpheus::test; diff --git a/morpheus/_lib/tests/llm/test_utils.cpp b/morpheus/_lib/tests/llm/test_utils.cpp index 7de9e097f8..10bc0b1013 100644 --- a/morpheus/_lib/tests/llm/test_utils.cpp +++ b/morpheus/_lib/tests/llm/test_utils.cpp @@ -18,19 +18,11 @@ #include "../test_utils/common.hpp" // IWYU pragma: associated #include "morpheus/llm/input_map.hpp" -#include "morpheus/llm/llm_context.hpp" -#include "morpheus/llm/llm_lambda_node.hpp" -#include "morpheus/llm/llm_node.hpp" -#include "morpheus/llm/llm_node_runner.hpp" -#include "morpheus/llm/llm_task.hpp" #include "morpheus/llm/utils.hpp" #include "morpheus/types.hpp" #include -#include -#include -#include #include #include #include diff --git a/morpheus/_lib/tests/messages/test_control_message.cpp b/morpheus/_lib/tests/messages/test_control_message.cpp index 7fe86afd6c..11eb5353b2 100644 --- a/morpheus/_lib/tests/messages/test_control_message.cpp +++ b/morpheus/_lib/tests/messages/test_control_message.cpp @@ -16,28 +16,31 @@ */ #include "../test_utils/common.hpp" // IWYU pragma: associated -#include "test_messages.hpp" +#include "test_messages.hpp" // for TestMessages -#include "morpheus/messages/control.hpp" -#include "morpheus/messages/memory/tensor_memory.hpp" -#include "morpheus/messages/meta.hpp" +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta -#include -#include +#include // for Message, TestPartResult, AssertionResult, TestInfo +#include // for basic_json, json_ref, json -#include -#include -#include -#include -#include -#include -#include +#include // for find +#include // for system_clock +#include // for map +#include // for allocator, make_shared, shared_ptr +#include // for optional +#include // for runtime_error +#include // for operator<=>, string, 
char_traits, basic_string +#include // for vector using namespace morpheus; using namespace morpheus::test; using clock_type_t = std::chrono::system_clock; +using TestControlMessage = morpheus::test::TestMessages; // NOLINT(readability-identifier-naming) + TEST_F(TestControlMessage, InitializationTest) { auto msg_one = ControlMessage(); @@ -330,4 +333,4 @@ TEST_F(TestControlMessage, GetTensorMemoryWhenNoneSet) // Verify that the retrieved tensor memory is nullptr EXPECT_EQ(nullptr, retrievedTensorMemory); -} \ No newline at end of file +} diff --git a/morpheus/_lib/tests/messages/test_message_meta.cpp b/morpheus/_lib/tests/messages/test_message_meta.cpp new file mode 100644 index 0000000000..9724704c1c --- /dev/null +++ b/morpheus/_lib/tests/messages/test_message_meta.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils/common.hpp" // IWYU pragma: associated +#include "test_messages.hpp" + +#include "morpheus/io/deserializers.hpp" // for load_table_from_file, prepare_df_index +#include "morpheus/messages/meta.hpp" // for MessageMeta and SlicedMessageMeta +#include "morpheus/objects/rmm_tensor.hpp" +#include "morpheus/objects/table_info.hpp" // for TableInfo +#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper + +#include +#include +#include // for gil_scoped_release, gil_scoped_acquire +#include // IWYU pragma: keep + +#include // for std::filesystem::path +#include // for shared_ptr +#include // for move + +using namespace morpheus; + +using TestMessageMeta = morpheus::test::TestMessages; // NOLINT(readability-identifier-naming) + +TEST_F(TestMessageMeta, SetMetaWithColumnName) +{ + pybind11::gil_scoped_release no_gil; + auto test_data_dir = test::get_morpheus_root() / "tests/tests_data"; + std::filesystem::path input_file = test_data_dir / "csv_sample.csv"; + + auto table = load_table_from_file(input_file); + auto meta = MessageMeta::create_from_cpp(std::move(table)); + + const std::size_t count = 3; + DType int_type(TypeId::INT64); + std::vector expected_ints{4, 5, 6}; + auto buffer = std::make_shared(count * int_type.item_size(), rmm::cuda_stream_per_thread); + + MRC_CHECK_CUDA(cudaMemcpy(buffer->data(), expected_ints.data(), buffer->size(), cudaMemcpyHostToDevice)); + + ShapeType shape{3, 1}; + auto tensor = std::make_shared(buffer, 0, int_type, shape); + TensorObject tensor_object(tensor); + meta->set_data("int", tensor_object); + + std::vector actual_ints(expected_ints.size()); + + auto cm_int_meta = meta->get_info().get_column(0); + MRC_CHECK_CUDA( + cudaMemcpy(actual_ints.data(), cm_int_meta.data(), count * sizeof(int64_t), cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected_ints, actual_ints); +} diff --git a/morpheus/_lib/tests/messages/test_messages.hpp b/morpheus/_lib/tests/messages/test_messages.hpp index ba5b4ea4ff..cf53f6ea2a 
100644 --- a/morpheus/_lib/tests/messages/test_messages.hpp +++ b/morpheus/_lib/tests/messages/test_messages.hpp @@ -19,7 +19,26 @@ #include "../test_utils/common.hpp" // IWYU pragma: associated +#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper + +#include + namespace morpheus::test { -using TestControlMessage = TestWithPythonInterpreter; // NOLINT -} // namespace morpheus::test \ No newline at end of file +class TestMessages : public morpheus::test::TestWithPythonInterpreter +{ + protected: + void SetUp() override + { + morpheus::test::TestWithPythonInterpreter::SetUp(); + { + pybind11::gil_scoped_acquire gil; + + // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: + // https://github.com/rapidsai/cudf/issues/12862 + CudfHelper::load(); + } + } +}; + +} // namespace morpheus::test diff --git a/morpheus/_lib/tests/messages/test_sliced_message_meta.cpp b/morpheus/_lib/tests/messages/test_sliced_message_meta.cpp index b0aa051007..d7e18d3bd9 100644 --- a/morpheus/_lib/tests/messages/test_sliced_message_meta.cpp +++ b/morpheus/_lib/tests/messages/test_sliced_message_meta.cpp @@ -16,11 +16,11 @@ */ #include "../test_utils/common.hpp" // IWYU pragma: associated +#include "test_messages.hpp" -#include "morpheus/io/deserializers.hpp" // for load_table_from_file, prepare_df_index -#include "morpheus/messages/meta.hpp" // for MessageMeta and SlicedMessageMeta -#include "morpheus/objects/table_info.hpp" // for TableInfo -#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper +#include "morpheus/io/deserializers.hpp" // for load_table_from_file, prepare_df_index +#include "morpheus/messages/meta.hpp" // for MessageMeta and SlicedMessageMeta +#include "morpheus/objects/table_info.hpp" // for TableInfo #include #include // for gil_scoped_release, gil_scoped_acquire @@ -32,21 +32,7 @@ using namespace morpheus; -class TestSlicedMessageMeta : public morpheus::test::TestWithPythonInterpreter -{ - protected: - void 
SetUp() override - { - morpheus::test::TestWithPythonInterpreter::SetUp(); - { - pybind11::gil_scoped_acquire gil; - - // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: - // https://github.com/rapidsai/cudf/issues/12862 - CudfHelper::load(); - } - } -}; +using TestSlicedMessageMeta = morpheus::test::TestMessages; // NOLINT(readability-identifier-naming) TEST_F(TestSlicedMessageMeta, TestCount) { diff --git a/morpheus/_lib/tests/modules/test_data_loader_module.cpp b/morpheus/_lib/tests/modules/test_data_loader_module.cpp index 4b04b091a7..5615657d05 100644 --- a/morpheus/_lib/tests/modules/test_data_loader_module.cpp +++ b/morpheus/_lib/tests/modules/test_data_loader_module.cpp @@ -39,7 +39,6 @@ #include #include -#include #include #include diff --git a/morpheus/_lib/tests/stages/test_add_classification.cpp b/morpheus/_lib/tests/stages/test_add_classification.cpp new file mode 100644 index 0000000000..8570edd7ac --- /dev/null +++ b/morpheus/_lib/tests/stages/test_add_classification.cpp @@ -0,0 +1,137 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils/common.hpp" // for get_morpheus_root, TEST_CLASS, morpheus + +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/objects/dtype.hpp" // for DType +#include "morpheus/objects/table_info.hpp" // for TableInfo +#include "morpheus/objects/tensor.hpp" // for Tensor +#include "morpheus/stages/add_classification.hpp" // for AddClassificationsStage +#include "morpheus/types.hpp" // for TensorIndex + +#include // for cudaMemcpy, cudaMemcpyKind +#include // for column_view +#include // for csv_reader_options_builder, read_csv, csv_reader_options +#include // for source_info, table_with_metadata +#include // for data_type +#include // for type_to_id +#include // for EXPECT_EQ, Message, TestInfo, TestPartResult, TEST_F +#include // for __check_cuda_errors, MRC_CHECK_CUDA +#include // for gil_scoped_release +#include // for cuda_stream_per_thread +#include // for device_buffer + +#include // for size_t +#include // for uint8_t +#include // for operator/, path +#include // for map +#include // for make_shared, allocator, __shared_ptr_access, shared_ptr +#include // for string +#include // for move +#include // for vector + +using namespace morpheus; + +TEST_CLASS_WITH_PYTHON(AddClassification); + +template +auto convert_to_host(rmm::device_buffer& buffer) +{ + std::vector host_buffer(buffer.size() / sizeof(T)); + + MRC_CHECK_CUDA(cudaMemcpy(host_buffer.data(), buffer.data(), buffer.size(), cudaMemcpyDeviceToHost)); + + return host_buffer; +} + +TEST_F(TestAddClassification, TestProcessControlMessageAndMultiResponseMessage) +{ + pybind11::gil_scoped_release no_gil; + auto test_data_dir = test::get_morpheus_root() / "tests/tests_data"; + std::filesystem::path input_file = test_data_dir / "bools.csv"; + + 
TensorIndex cols_size = 3; + TensorIndex mess_count = 3; + + double threshold = 0.4; + + auto packed_data_host = std::vector{ + 0.1, + 0.2, + 0.3, // All below + 0.5, + 0.0, + 0.0, // Only one above + 0.7, + 0.1, + 0.9 // All above + }; + + auto packed_data = std::make_shared( + packed_data_host.data(), cols_size * mess_count * sizeof(double), rmm::cuda_stream_per_thread); + + cudf::io::csv_reader_options read_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info(input_file)) + .dtypes({cudf::data_type(cudf::data_type{cudf::type_to_id()})}) + .header(0); + auto meta_mm = MessageMeta::create_from_cpp(cudf::io::read_csv(read_opts)); + + std::map idx2label = {{0, "bool"}}; + + // Create MultiResponseMessage + auto tensor = Tensor::create(packed_data, DType::create(), {mess_count, cols_size}, {}, 0); + auto tensor_memory = std::make_shared(mess_count); + tensor_memory->set_tensor("probs", std::move(tensor)); + auto mm = std::make_shared(std::move(meta_mm), 0, mess_count, std::move(tensor_memory)); + + // Create PreProcessMultiMessageStage + auto mm_stage = std::make_shared(idx2label, 0.4); + auto mm_response = mm_stage->on_data(mm); + + // Create a separate dataframe from a file (otherwise they will overwrite eachother) + auto meta_cm = MessageMeta::create_from_cpp(cudf::io::read_csv(read_opts)); + + // Create ControlMessage + auto cm = std::make_shared(); + cm->payload(std::move(meta_cm)); + auto cm_tensor = Tensor::create(packed_data, DType::create(), {mess_count, cols_size}, {}, 0); + auto cm_tensor_memory = std::make_shared(mess_count); + cm_tensor_memory->set_tensor("probs", std::move(cm_tensor)); + cm->tensors(cm_tensor_memory); + + // Create PreProcessControlMessageStage + auto cm_stage = std::make_shared(idx2label, 0.4); + auto cm_response = cm_stage->on_data(cm); + + // Verify the output meta + std::vector expected_meta = {'\0', '\x1', '\x1'}; + auto mm_meta = mm_response->get_meta().get_column(0); + auto cm_meta = 
cm_response->payload()->get_info().get_column(0); + + // std::vector is a template specialization which does not have data() method, use std::vector here + std::vector mm_meta_host(mm_meta.size()); + std::vector cm_meta_host(cm_meta.size()); + MRC_CHECK_CUDA( + cudaMemcpy(mm_meta_host.data(), mm_meta.data(), mm_meta.size() * sizeof(bool), cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA( + cudaMemcpy(cm_meta_host.data(), cm_meta.data(), cm_meta.size() * sizeof(bool), cudaMemcpyDeviceToHost)); + EXPECT_EQ(mm_meta_host, expected_meta); + EXPECT_EQ(mm_meta_host, cm_meta_host); +} diff --git a/morpheus/_lib/tests/stages/test_add_scores.cpp b/morpheus/_lib/tests/stages/test_add_scores.cpp new file mode 100644 index 0000000000..1bfd3a79b1 --- /dev/null +++ b/morpheus/_lib/tests/stages/test_add_scores.cpp @@ -0,0 +1,118 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils/common.hpp" // for get_morpheus_root, TEST_CLASS, morpheus +#include "../test_utils/tensor_utils.hpp" + +#include "morpheus/io/deserializers.hpp" // for load_table_from_file +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/messages/multi_response.hpp" // for MultiResponseMessage +#include "morpheus/objects/dtype.hpp" // for DType +#include "morpheus/objects/table_info.hpp" // for TableInfo +#include "morpheus/objects/tensor.hpp" // for Tensor +#include "morpheus/stages/add_scores.hpp" // for AddScoresStage +#include "morpheus/stages/preallocate.hpp" +#include "morpheus/types.hpp" // for TensorIndex + +#include // for EXPECT_EQ, Message, TestInfo, TestPartResult, TEST_F +#include // for gil_scoped_release +#include // for cuda_stream_per_thread +#include // for device_buffer + +#include // for size_t +#include // for operator/, path +#include // for map +#include // for make_shared, allocator, __shared_ptr_access, shared_ptr +#include // for string +#include +#include // for move +#include // for vector + +using namespace morpheus::test; + +using namespace morpheus; + +TEST_CLASS_WITH_PYTHON(AddScores); + +TEST_F(TestAddScores, TestProcessControlMessageAndMultiResponseMessage) +{ + pybind11::gil_scoped_release no_gil; + auto test_data_dir = test::get_morpheus_root() / "tests/tests_data"; + std::filesystem::path input_file = test_data_dir / "floats.csv"; + + TensorIndex cols_size = 2; + TensorIndex mess_count = 3; + + auto packed_data_host = std::vector{ + 0.1, + 1.0, + 0, + 23456, + 1.4013e-45, + 9.3e5, + }; + + auto packed_data = std::make_shared( + packed_data_host.data(), cols_size * mess_count * sizeof(double), rmm::cuda_stream_per_thread); + + // Create a dataframe from a file + auto meta_mm = MessageMeta::create_from_cpp(load_table_from_file(input_file)); + 
preallocate(meta_mm, {{"colA", TypeId::FLOAT64}, {"colB", TypeId::FLOAT64}}); + + std::map idx2label = {{0, "colA"}, {1, "colB"}}; + + // Create MultiResponseMessage + auto tensor = Tensor::create(packed_data, DType::create(), {mess_count, cols_size}, {}, 0); + auto tensor_memory = std::make_shared(mess_count); + tensor_memory->set_tensor("probs", std::move(tensor)); + auto mm = std::make_shared(std::move(meta_mm), 0, mess_count, std::move(tensor_memory)); + + // Create PreProcessMultiMessageStage + auto mm_stage = std::make_shared(idx2label); + auto mm_response = mm_stage->on_data(mm); + + // Create a separate dataframe from a file (otherwise they will overwrite eachother) + auto meta_cm = MessageMeta::create_from_cpp(load_table_from_file(input_file)); + preallocate(meta_cm, {{"colA", TypeId::FLOAT64}, {"colB", TypeId::FLOAT64}}); + + // Create ControlMessage + auto cm = std::make_shared(); + cm->payload(std::move(meta_cm)); + auto cm_tensor = Tensor::create(packed_data, DType::create(), {mess_count, cols_size}, {}, 0); + auto cm_tensor_memory = std::make_shared(mess_count); + cm_tensor_memory->set_tensor("probs", std::move(cm_tensor)); + cm->tensors(cm_tensor_memory); + + // Create PreProcessControlMessageStage + auto cm_stage = std::make_shared(idx2label); + auto cm_response = cm_stage->on_data(cm); + + // Verify the output meta + std::vector expected_colA = {0.1, 0, 1.4013e-45}; + std::vector expected_colB = {1.0, 23456, 9.3e5}; + + auto mm_table = mm_response->get_meta(std::vector{"colA", "colB"}); + auto cm_table = cm_response->payload()->get_info(std::vector{"colA", "colB"}); + + assert_eq_device_to_host(mm_table.get_column(0), expected_colA); + assert_eq_device_to_host(mm_table.get_column(1), expected_colB); + + assert_eq_device_to_host(cm_table.get_column(0), expected_colA); + assert_eq_device_to_host(cm_table.get_column(1), expected_colB); +} diff --git a/morpheus/_lib/tests/stages/test_preprocess_fil.cpp 
b/morpheus/_lib/tests/stages/test_preprocess_fil.cpp new file mode 100644 index 0000000000..d290a81c3e --- /dev/null +++ b/morpheus/_lib/tests/stages/test_preprocess_fil.cpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils/common.hpp" // for get_morpheus_root, TEST_CLASS, morpheus + +#include "morpheus/io/deserializers.hpp" // for load_table_from_file +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/messages/multi.hpp" // for MultiMessage +#include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage +#include "morpheus/objects/tensor_object.hpp" // for TensorObject +#include "morpheus/stages/preprocess_fil.hpp" // for PreprocessFILStage, PreprocessFILStageCC, PreprocessFI... 
+#include "morpheus/types.hpp" // for TensorIndex + +#include // for cudaMemcpy, cudaMemcpyKind +#include // for EXPECT_EQ, Message, TestPartResult, TestInfo, TEST_F +#include // for __check_cuda_errors, MRC_CHECK_CUDA +#include // for gil_scoped_release + +#include // for path, operator/ +#include // for allocator, make_shared, __shared_ptr_access, shared_ptr +#include // for string +#include // for move +#include // for vector + +using namespace morpheus; + +TEST_CLASS_WITH_PYTHON(PreprocessFIL); + +TEST_F(TestPreprocessFIL, TestProcessControlMessageAndMultiMessage) +{ + pybind11::gil_scoped_release no_gil; + auto test_data_dir = test::get_morpheus_root() / "tests/tests_data"; + std::filesystem::path input_file = test_data_dir / "float_str.csv"; + + // Create a dataframe from a file + auto cm_table = load_table_from_file(input_file); + auto cm_meta = MessageMeta::create_from_cpp(std::move(cm_table)); + + auto mm_table = load_table_from_file(input_file); + auto mm_meta = MessageMeta::create_from_cpp(std::move(mm_table)); + + // Create ControlMessage + auto cm = std::make_shared(); + cm->payload(cm_meta); + + // Create PreProcessControlMessageStage + auto cm_stage = std::make_shared(std::vector{"float_str1", "float_str2"}); + auto cm_response = cm_stage->on_data(cm); + + // Create MultiMessage + auto mm = std::make_shared(mm_meta); + // Create PreProcessMultiMessageStage + auto mm_stage = std::make_shared(std::vector{"float_str1", "float_str2"}); + auto mm_response = mm_stage->on_data(mm); + + auto cm_tensors = cm_response->tensors(); + auto mm_tensors = mm_response->memory; + + // Verify output tensors + std::vector expected_input__0 = {1, 4, 2, 5, 3, 6}; + auto cm_input__0 = cm_tensors->get_tensor("input__0"); + auto mm_input__0 = mm_tensors->get_tensor("input__0"); + std::vector cm_input__0_host(cm_input__0.count()); + std::vector mm_input__0_host(mm_input__0.count()); + MRC_CHECK_CUDA(cudaMemcpy( + cm_input__0_host.data(), cm_input__0.data(), 
cm_input__0.count() * sizeof(float), cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA(cudaMemcpy( + mm_input__0_host.data(), mm_input__0.data(), mm_input__0.count() * sizeof(float), cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected_input__0, cm_input__0_host); + EXPECT_EQ(cm_input__0_host, mm_input__0_host); + + std::vector expected_seq_ids = {0, 0, 1, 1, 0, 1, 2, 0, 1}; + auto cm_seq_ids = cm_tensors->get_tensor("seq_ids"); + auto mm_seq_ids = mm_tensors->get_tensor("seq_ids"); + std::vector cm_seq_ids_host(cm_seq_ids.count()); + std::vector mm_seq_ids_host(mm_seq_ids.count()); + MRC_CHECK_CUDA(cudaMemcpy( + cm_seq_ids_host.data(), cm_seq_ids.data(), cm_seq_ids.count() * sizeof(TensorIndex), cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA(cudaMemcpy( + mm_seq_ids_host.data(), mm_seq_ids.data(), mm_seq_ids.count() * sizeof(TensorIndex), cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected_seq_ids, cm_seq_ids_host); + EXPECT_EQ(cm_seq_ids_host, mm_seq_ids_host); +} diff --git a/morpheus/_lib/tests/stages/test_preprocess_nlp.cpp b/morpheus/_lib/tests/stages/test_preprocess_nlp.cpp new file mode 100644 index 0000000000..229c593c18 --- /dev/null +++ b/morpheus/_lib/tests/stages/test_preprocess_nlp.cpp @@ -0,0 +1,129 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils/common.hpp" // for get_morpheus_root, TestWithPythonInterpreter, morpheus + +#include "morpheus/io/deserializers.hpp" // for load_table_from_file +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/messages/multi.hpp" // for MultiMessage +#include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage +#include "morpheus/objects/tensor_object.hpp" // for TensorObject +#include "morpheus/stages/preprocess_nlp.hpp" // for PreprocessNLPStage, PreprocessNLPStageCC, PreprocessNL... +#include "morpheus/types.hpp" // for TensorIndex + +#include // for cudaMemcpy, cudaMemcpyKind +#include // for EXPECT_EQ, Message, TestPartResult, TestInfo, TEST_F +#include // for __check_cuda_errors, MRC_CHECK_CUDA +#include // for gil_scoped_acquire, gil_scoped_release + +#include // for int32_t +#include // for operator/, path +#include // for allocator, make_shared, __shared_ptr_access, shared_ptr +#include // for move +#include // for vector + +using namespace morpheus; + +TEST_CLASS_WITH_PYTHON(PreprocessNLP); + +TEST_F(TestPreprocessNLP, TestProcessControlMessageAndMultiMessage) +{ + pybind11::gil_scoped_release no_gil; + auto test_data_dir = test::get_morpheus_root() / "tests/tests_data"; + std::filesystem::path input_file = test_data_dir / "countries_sample.csv"; + + auto test_vocab_hash_file_dir = test::get_morpheus_root() / "morpheus/data"; + std::filesystem::path vocab_hash_file = test_vocab_hash_file_dir / "bert-base-cased-hash.txt"; + + // Create a dataframe from a file + auto table = load_table_from_file(input_file); + auto meta = MessageMeta::create_from_cpp(std::move(table)); + + // Create ControlMessage + auto cm = std::make_shared(); + cm->payload(meta); + + // Create PreProcessControlMessageStage + auto cm_stage = std::make_shared(vocab_hash_file 
/*vocab_hash_file*/, + 1 /*sequence_length*/, + false /*truncation*/, + false /*do_lower_case*/, + false /*add_special_token*/, + 1 /*stride*/, + "country" /*column*/); + + auto cm_response = cm_stage->on_data(cm); + + // Create MultiMessage + auto mm = std::make_shared(meta); + + // Create PreProcessMultiMessageStage + auto mm_stage = std::make_shared(vocab_hash_file /*vocab_hash_file*/, + 1 /*sequence_length*/, + false /*truncation*/, + false /*do_lower_case*/, + false /*add_special_token*/, + 1 /*stride*/, + "country" /*column*/); + auto mm_response = mm_stage->on_data(mm); + + auto cm_tensors = cm_response->tensors(); + auto mm_tensors = mm_response->memory; + + // Verify output tensors + std::vector expected_input_ids = {6469, 10278, 11347, 1262, 27583, 13833}; + auto cm_input_ids = cm_tensors->get_tensor("input_ids"); + auto mm_input_ids = mm_tensors->get_tensor("input_ids"); + std::vector cm_input_ids_host(cm_input_ids.count()); + std::vector mm_input_ids_host(mm_input_ids.count()); + MRC_CHECK_CUDA(cudaMemcpy( + cm_input_ids_host.data(), cm_input_ids.data(), cm_input_ids.count() * sizeof(int32_t), cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA(cudaMemcpy( + mm_input_ids_host.data(), mm_input_ids.data(), mm_input_ids.count() * sizeof(int32_t), cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected_input_ids, cm_input_ids_host); + EXPECT_EQ(cm_input_ids_host, mm_input_ids_host); + + std::vector expected_input_mask = {1, 1, 1, 1, 1, 1}; + auto cm_input_mask = cm_tensors->get_tensor("input_mask"); + auto mm_input_mask = mm_tensors->get_tensor("input_mask"); + std::vector cm_input_mask_host(cm_input_mask.count()); + std::vector mm_input_mask_host(mm_input_mask.count()); + MRC_CHECK_CUDA(cudaMemcpy(cm_input_mask_host.data(), + cm_input_mask.data(), + cm_input_mask.count() * sizeof(int32_t), + cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA(cudaMemcpy(mm_input_mask_host.data(), + mm_input_mask.data(), + mm_input_mask.count() * sizeof(int32_t), + cudaMemcpyDeviceToHost)); + 
EXPECT_EQ(expected_input_mask, cm_input_mask_host); + EXPECT_EQ(cm_input_mask_host, mm_input_mask_host); + + std::vector expected_seq_ids = {0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 3, 0, 0, 4, 0, 0}; + auto cm_seq_ids = cm_tensors->get_tensor("seq_ids"); + auto mm_seq_ids = mm_tensors->get_tensor("seq_ids"); + std::vector cm_seq_ids_host(cm_seq_ids.count()); + std::vector mm_seq_ids_host(mm_seq_ids.count()); + MRC_CHECK_CUDA(cudaMemcpy( + cm_seq_ids_host.data(), cm_seq_ids.data(), cm_seq_ids.count() * sizeof(TensorIndex), cudaMemcpyDeviceToHost)); + MRC_CHECK_CUDA(cudaMemcpy( + mm_seq_ids_host.data(), mm_seq_ids.data(), mm_seq_ids.count() * sizeof(TensorIndex), cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected_seq_ids, cm_seq_ids_host); + EXPECT_EQ(cm_seq_ids_host, mm_seq_ids_host); +} diff --git a/morpheus/_lib/tests/test_dev_mem_info.cpp b/morpheus/_lib/tests/test_dev_mem_info.cpp index 87ea8d158d..8b0a8b8a65 100644 --- a/morpheus/_lib/tests/test_dev_mem_info.cpp +++ b/morpheus/_lib/tests/test_dev_mem_info.cpp @@ -22,6 +22,7 @@ #include "morpheus/objects/memory_descriptor.hpp" #include "morpheus/types.hpp" // for ShapeType, TensorIndex +#include #include // for AssertionResult, SuiteApiResolver, TestInfo, EXPECT_TRUE, Message, TEST_F, Test, TestFactoryImpl, TestPartResult #include #include @@ -33,7 +34,6 @@ #include // for size_t #include // shared_ptr -#include // for vector // IWYU pragma: no_include "thrust/iterator/iterator_facade.h" // IWYU pragma: no_include diff --git a/morpheus/_lib/tests/test_file_in_out.cpp b/morpheus/_lib/tests/test_file_in_out.cpp index ae6266247e..552e5bb8a7 100644 --- a/morpheus/_lib/tests/test_file_in_out.cpp +++ b/morpheus/_lib/tests/test_file_in_out.cpp @@ -32,7 +32,6 @@ #include #include // IWYU pragma: keep #include // for shared_ptr -#include // for stringstream #include #include // for move #include diff --git a/morpheus/_lib/tests/test_tensor.cpp b/morpheus/_lib/tests/test_tensor.cpp index 236fea83d0..dd6dbc96ae 100644 --- 
a/morpheus/_lib/tests/test_tensor.cpp +++ b/morpheus/_lib/tests/test_tensor.cpp @@ -25,6 +25,7 @@ #include "morpheus/types.hpp" // for ShapeType, TensorIndex #include "morpheus/utilities/tensor_util.hpp" // for TensorUtils +#include #include #include // for AssertionResult, SuiteApiResolver, TestInfo, EXPECT_TRUE, Message, TEST_F, Test, TestFactoryImpl, TestPartResult #include @@ -36,6 +37,7 @@ #include // shared_ptr #include // for allocator, operator==, basic_string, string #include // for vector + // IWYU pragma: no_include "morpheus/utilities/string_util.hpp" // IWYU thinks we need ext/new_allocator.h for size_t for some reason // IWYU pragma: no_include diff --git a/morpheus/_lib/tests/test_utils/common.cpp b/morpheus/_lib/tests/test_utils/common.cpp index 92a12e1eb3..1c8eb86fa8 100644 --- a/morpheus/_lib/tests/test_utils/common.cpp +++ b/morpheus/_lib/tests/test_utils/common.cpp @@ -23,31 +23,37 @@ #include "morpheus/io/loaders/payload.hpp" #include "morpheus/io/loaders/rest.hpp" #include "morpheus/messages/meta.hpp" +#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/string_util.hpp" +#include // for PyStatus_Exception, PyConfig_Clear, PyConfig_InitPythonConfig #include -#include -#include #include #include #include +#include // for Py_InitializeFromConfig -#include #include +#include // for codecvt_utf8_utf16 #include #include #include +#include #include #include #include +#ifndef PYTHON_EXECUTABLE + #error PYTHON_EXECUTABLE must be defined to run tests +#endif + namespace morpheus::test { bool TestWithPythonInterpreter::m_initialized = false; void TestWithPythonInterpreter::SetUp() { - initialize_interpreter(); + this->initialize_interpreter(); LoaderRegistry::register_factory_fn( "file", @@ -73,6 +79,11 @@ void TestWithPythonInterpreter::SetUp() return std::make_unique(config); }, false); + + pybind11::gil_scoped_acquire gil; + + // Ensure that the cudf helpers are loaded so we can convert dataframes to MessageMeta + 
CudfHelper::load(); } void TestWithPythonInterpreter::TearDown() {} @@ -81,7 +92,43 @@ void TestWithPythonInterpreter::initialize_interpreter() const { if (!m_initialized) { - pybind11::initialize_interpreter(); + using namespace std::string_literals; + + // NOTE: We manually initialize the Python interpreter here because we need to specify the Python executable to + // use in order to enable virtual environments. Otherwise, the Python interpreter will be initialized with the + // default executable, which may not be the one we want to use (and will make it difficult to discover why tests + // are failing). + PyConfig config; + PyConfig_InitPythonConfig(&config); + + // Create a wstring from the PYTHON_EXECUTABLE string + std::wstring_convert> converter; + + auto python_exe_w = converter.from_bytes(PYTHON_EXECUTABLE); + + // Set the program name to the python executable to ensure any virtualenvs are loaded correctly + PyStatus status = PyConfig_SetString(&config, &config.program_name, python_exe_w.data()); + if (PyStatus_Exception(status)) + { + throw std::runtime_error("Failed to set Python program name to "s + PYTHON_EXECUTABLE); + } + + // Load the remainder of the configuration + status = PyConfig_Read(&config); + if (PyStatus_Exception(status)) + { + throw std::runtime_error("Failed to read Python configuration"); + } + + status = Py_InitializeFromConfig(&config); + if (PyStatus_Exception(status)) + { + throw std::runtime_error("Failed to initialize Python interpreter"); + } + + // Cleanup the configuration object + PyConfig_Clear(&config); + m_initialized = true; } } diff --git a/morpheus/_lib/tests/test_utils/common.hpp b/morpheus/_lib/tests/test_utils/common.hpp index 2b681635c8..5413b1f898 100644 --- a/morpheus/_lib/tests/test_utils/common.hpp +++ b/morpheus/_lib/tests/test_utils/common.hpp @@ -32,6 +32,10 @@ void SetUp() override {} \ } +#define TEST_CLASS_WITH_PYTHON(name) \ + class __attribute__((visibility("default"))) Test##name : public 
morpheus::test::TestWithPythonInterpreter \ + {} + namespace morpheus { class MessageMeta; } diff --git a/morpheus/_lib/tests/test_utils/tensor_utils.cpp b/morpheus/_lib/tests/test_utils/tensor_utils.cpp new file mode 100644 index 0000000000..a5e65ef994 --- /dev/null +++ b/morpheus/_lib/tests/test_utils/tensor_utils.cpp @@ -0,0 +1,20 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "./tensor_utils.hpp" + +namespace morpheus::test {} // namespace morpheus::test diff --git a/morpheus/_lib/tests/test_utils/tensor_utils.hpp b/morpheus/_lib/tests/test_utils/tensor_utils.hpp new file mode 100644 index 0000000000..d7896607e2 --- /dev/null +++ b/morpheus/_lib/tests/test_utils/tensor_utils.hpp @@ -0,0 +1,84 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include // for cudaMemcpy, cudaMemcpyKind +#include +#include // for data_type +#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include +#include + +#include // for char_traits, operator<<, basic_ostream +#include + +namespace morpheus::test { + +template +auto convert_to_host(const rmm::device_buffer& buffer) +{ + std::vector host_buffer(buffer.size() / sizeof(T)); + + MRC_CHECK_CUDA(cudaMemcpy(host_buffer.data(), buffer.data(), buffer.size(), cudaMemcpyDeviceToHost)); + + return host_buffer; +} + +template +auto convert_to_host(const cudf::column_view& buffer) +{ + CHECK(buffer.type().id() == cudf::type_to_id()) << "Column has different type than requested"; + + std::vector host_buffer(buffer.size()); + + MRC_CHECK_CUDA(cudaMemcpy(host_buffer.data(), buffer.data(), buffer.size() * sizeof(T), cudaMemcpyDeviceToHost)); + + return host_buffer; +} + +template +void assert_eq_device_to_host(const rmm::device_buffer& device, const std::vector& host) +{ + std::vector device_on_host = convert_to_host(device); + + ASSERT_EQ(device_on_host, host); +} + +template +void assert_eq_device_to_host(const cudf::column_view& device, const std::vector& host) +{ + std::vector device_on_host = convert_to_host(device); + + ASSERT_EQ(device_on_host, host); +} + +template +void assert_eq_device_to_device(const cudf::column_view& device1, const cudf::column_view& device2) +{ + ASSERT_EQ(device1.size(), device2.size()) << "Columns have different sizes"; + ASSERT_EQ(device1.type(), device2.type()) << "Columns have different 
types"; + + std::vector device1_on_host = convert_to_host(device1); + std::vector device2_on_host = convert_to_host(device2); + + ASSERT_EQ(device1_on_host, device2_on_host); +} + +} // namespace morpheus::test diff --git a/morpheus/cli/run.py b/morpheus/cli/run.py index afe64da93e..ebd2b3c932 100644 --- a/morpheus/cli/run.py +++ b/morpheus/cli/run.py @@ -16,7 +16,7 @@ def run_cli(): """Main entrypoint for the CLI""" - from morpheus.cli.commands import cli + from morpheus.cli.commands import cli # pylint: disable=cyclic-import cli(obj={}, auto_envvar_prefix='MORPHEUS', show_default=True, prog_name="morpheus") diff --git a/morpheus/controllers/serialize_controller.py b/morpheus/controllers/serialize_controller.py index 9c6f1bdf69..dd653b8325 100644 --- a/morpheus/controllers/serialize_controller.py +++ b/morpheus/controllers/serialize_controller.py @@ -16,6 +16,7 @@ import re import typing +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage @@ -62,7 +63,7 @@ def fixed_columns(self): return self._fixed_columns def convert_to_df(self, - x: MultiMessage, + x: typing.Union[MultiMessage, ControlMessage], include_columns: typing.Pattern, exclude_columns: typing.List[typing.Pattern]): """ @@ -70,8 +71,8 @@ def convert_to_df(self, Parameters ---------- - x : `morpheus.pipeline.messages.MultiMessage` - MultiMessage instance that contains data. + x : `morpheus.pipeline.messages.MultiMessage` or `morpheus.pipeline.messages.ControlMessage` + MultiMessage or ControlMessage instance that contains data. include_columns : typing.Pattern Columns that are required send to downstream stage. 
exclude_columns : typing.List[typing.Pattern] @@ -85,7 +86,10 @@ def convert_to_df(self, columns: typing.List[str] = [] # Minimize access to x.meta.df - df_columns = list(x.meta.df.columns) + if isinstance(x, MultiMessage): + df_columns = list(x.meta.df.columns) + elif isinstance(x, ControlMessage): + df_columns = list(x.payload().get_column_names()) # First build up list of included. If no include regex is specified, select all if (include_columns is None): @@ -100,7 +104,10 @@ def convert_to_df(self, self._columns = columns # Get metadata from columns - df = x.get_meta(columns) + if isinstance(x, MultiMessage): + df = x.get_meta(self._columns) + elif isinstance(x, ControlMessage): + df = x.payload().get_data(columns) return MessageMeta(df=df) diff --git a/morpheus/llm/nodes/extracter_node.py b/morpheus/llm/nodes/extracter_node.py index 13a0907f26..b6ad8c5e0a 100644 --- a/morpheus/llm/nodes/extracter_node.py +++ b/morpheus/llm/nodes/extracter_node.py @@ -33,7 +33,7 @@ def get_input_names(self) -> list[str]: # This node does not receive its inputs from upstream nodes, but rather from the task itself return [] - async def execute(self, context: LLMContext) -> LLMContext: + async def execute(self, context: LLMContext) -> LLMContext: # pylint: disable=invalid-overridden-method # Get the keys from the task input_keys: list[str] = typing.cast(list[str], context.task()["input_keys"]) diff --git a/morpheus/llm/nodes/llm_generate_node.py b/morpheus/llm/nodes/llm_generate_node.py index 55f86063fd..08a5198078 100644 --- a/morpheus/llm/nodes/llm_generate_node.py +++ b/morpheus/llm/nodes/llm_generate_node.py @@ -43,7 +43,7 @@ def __init__(self, llm_client: LLMClient) -> None: def get_input_names(self) -> list[str]: return self._llm_client.get_input_names() - async def execute(self, context: LLMContext) -> LLMContext: + async def execute(self, context: LLMContext) -> LLMContext: # pylint: disable=invalid-overridden-method # Get the inputs inputs: dict[str, list[str]] = 
context.get_inputs() diff --git a/morpheus/llm/nodes/prompt_template_node.py b/morpheus/llm/nodes/prompt_template_node.py index 12903a795e..65a834b3c5 100644 --- a/morpheus/llm/nodes/prompt_template_node.py +++ b/morpheus/llm/nodes/prompt_template_node.py @@ -66,7 +66,7 @@ def __init__(self, template: str, template_format: typing.Literal["f-string", "j def get_input_names(self): return self._input_names - async def execute(self, context: LLMContext): + async def execute(self, context: LLMContext): # pylint: disable=invalid-overridden-method # Get the keys from the task input_dict = context.get_inputs() diff --git a/morpheus/llm/nodes/retriever_node.py b/morpheus/llm/nodes/retriever_node.py index 47e4eee7cb..9d8df4109c 100644 --- a/morpheus/llm/nodes/retriever_node.py +++ b/morpheus/llm/nodes/retriever_node.py @@ -63,7 +63,7 @@ def get_input_names(self) -> list[str]: return ["query"] - async def execute(self, context: LLMContext): + async def execute(self, context: LLMContext): # pylint: disable=invalid-overridden-method """ Execute the retrieval process based on the provided context. 
diff --git a/morpheus/llm/task_handlers/simple_task_handler.py b/morpheus/llm/task_handlers/simple_task_handler.py index 294eda4681..8f225581b3 100644 --- a/morpheus/llm/task_handlers/simple_task_handler.py +++ b/morpheus/llm/task_handlers/simple_task_handler.py @@ -43,6 +43,7 @@ def __init__(self, output_columns: list[str] = None) -> None: def get_input_names(self) -> list[str]: return self._output_columns + # pylint: disable=invalid-overridden-method async def try_handle(self, context: LLMContext) -> list[ControlMessage]: input_dict = context.get_inputs() diff --git a/morpheus/modules/file_batcher.py b/morpheus/modules/file_batcher.py index 214f6c0e5b..05f0288096 100644 --- a/morpheus/modules/file_batcher.py +++ b/morpheus/modules/file_batcher.py @@ -157,10 +157,10 @@ def build_period_batches(files: typing.List[str], sampling = f"{sampling_rate_s}S" if (start_time is not None): - start_time = datetime.datetime.strptime(start_time, '%Y-%m-%d').replace(tzinfo=datetime.timezone.utc) + start_time = datetime.datetime.fromisoformat(start_time).replace(tzinfo=datetime.timezone.utc) if (end_time is not None): - end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d').replace(tzinfo=datetime.timezone.utc) + end_time = datetime.datetime.fromisoformat(end_time).replace(tzinfo=datetime.timezone.utc) except Exception as exec_info: logger.error("Error parsing parameters: %s", (exec_info)) diff --git a/morpheus/modules/output/write_to_vector_db.py b/morpheus/modules/output/write_to_vector_db.py index a83f254b8e..c141aef7c6 100644 --- a/morpheus/modules/output/write_to_vector_db.py +++ b/morpheus/modules/output/write_to_vector_db.py @@ -132,6 +132,7 @@ def _write_to_vector_db(builder: mrc.Builder): write_time_interval = write_to_vdb_config.write_time_interval # Check if service is serialized and convert if needed + # pylint: disable=not-a-mapping service: VectorDBService = (pickle.loads(bytes(service, "latin1")) if is_service_serialized else 
VectorDBServiceFactory.create_instance(service_name=service, **service_kwargs)) @@ -210,6 +211,8 @@ def on_data(msg: typing.Union[ControlMessage, MultiResponseMessage, MultiMessage >= write_time_interval): if accum_stats.data: merged_df = cudf.concat(accum_stats.data) + + # pylint: disable=not-a-mapping service.insert_dataframe(name=key, df=merged_df, **resource_kwargs) # Reset accumulator stats accum_stats.data.clear() diff --git a/morpheus/pipeline/__init__.py b/morpheus/pipeline/__init__.py index 3df9d30f23..169bddafe1 100644 --- a/morpheus/pipeline/__init__.py +++ b/morpheus/pipeline/__init__.py @@ -15,9 +15,25 @@ All objects related to building and running a pipeline. """ +# Note: The pipeline module is unique in that we re-export all of the classes and functions from the submodules. To +# avoid circular imports, we must import the classes in a specific order. And in each submodule, we should never import +# the from pipeline submodules. Instead, we should import from the parent module as a namespace packag and then use the +# fully qualified name to access the classes. 
For example, in morpheus/pipeline/stage.py: +# Do not do this: +# ``` +# from morpheus.pipeline.stage_base import StageBase +# ``` +# Instead, do this: +# ``` +# import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import +# class Stage(_pipeline.StageBase): +# ``` + # These must be imported in a specific order # isort: off +from morpheus.pipeline.boundary_stage_mixin import BoundaryStageMixin +from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.stage_schema import PortSchema from morpheus.pipeline.stage_schema import StageSchema from morpheus.pipeline.sender import Sender diff --git a/morpheus/pipeline/linear_pipeline.py b/morpheus/pipeline/linear_pipeline.py index c47f5e029d..7b8a4a767c 100644 --- a/morpheus/pipeline/linear_pipeline.py +++ b/morpheus/pipeline/linear_pipeline.py @@ -15,10 +15,8 @@ import logging import typing -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config -from morpheus.stages.boundary.linear_boundary_stage import LinearBoundaryEgressStage -from morpheus.stages.boundary.linear_boundary_stage import LinearBoundaryIngressStage SinglePortStageT = typing.TypeVar("SinglePortStageT", bound=_pipeline.SinglePortStage) SourceT = typing.TypeVar("SourceT", bound=_pipeline.SourceStage) @@ -138,6 +136,12 @@ def add_segment_boundary(self, data_type=None, as_shared_pointer=False): >>> >>> pipe.run() """ + + # Local imports to avoid circular dependencies + # pylint:disable=cyclic-import + from morpheus.stages.boundary.linear_boundary_stage import LinearBoundaryEgressStage + from morpheus.stages.boundary.linear_boundary_stage import LinearBoundaryIngressStage + assert as_shared_pointer is False, "Shared pointers are not currently supported" if (len(self._linear_stages) == 0): diff --git a/morpheus/pipeline/multi_message_stage.py b/morpheus/pipeline/multi_message_stage.py index eba98a8a21..a5a45670e8 100644 --- 
a/morpheus/pipeline/multi_message_stage.py +++ b/morpheus/pipeline/multi_message_stage.py @@ -18,7 +18,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MultiMessage diff --git a/morpheus/pipeline/pass_thru_type_mixin.py b/morpheus/pipeline/pass_thru_type_mixin.py index 7db5554813..5473572180 100644 --- a/morpheus/pipeline/pass_thru_type_mixin.py +++ b/morpheus/pipeline/pass_thru_type_mixin.py @@ -14,7 +14,7 @@ """Mixin for single port stages which receive and emit the same type.""" from abc import ABC -from morpheus.pipeline.stage_schema import StageSchema +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import class PassThruTypeMixin(ABC): @@ -23,6 +23,6 @@ class PassThruTypeMixin(ABC): `typing.Any`, and who's output type is inferred from the output types of the parent stages. """ - def compute_schema(self, schema: StageSchema): + def compute_schema(self, schema: _pipeline.StageSchema): for (port_idx, port_schema) in enumerate(schema.input_schemas): schema.output_schemas[port_idx].set_type(port_schema.get_type()) diff --git a/morpheus/pipeline/pipeline.py b/morpheus/pipeline/pipeline.py index df40264bf4..180a9a2188 100644 --- a/morpheus/pipeline/pipeline.py +++ b/morpheus/pipeline/pipeline.py @@ -28,19 +28,13 @@ import networkx from tqdm import tqdm +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config -from morpheus.pipeline.boundary_stage_mixin import BoundaryStageMixin -from morpheus.pipeline.preallocator_mixin import PreallocatorMixin -from morpheus.pipeline.receiver import Receiver -from morpheus.pipeline.sender import Sender -from morpheus.pipeline.source_stage import SourceStage -from morpheus.pipeline.stage import Stage -from morpheus.pipeline.stage_base import StageBase from morpheus.utils.type_utils 
import pretty_print_type_name logger = logging.getLogger(__name__) -StageT = typing.TypeVar("StageT", bound=StageBase) +StageT = typing.TypeVar("StageT", bound=_pipeline.StageBase) class PipelineState(Enum): @@ -75,10 +69,10 @@ def __init__(self, config: Config): self._num_threads = config.num_threads # Complete set of nodes across segments in this pipeline - self._stages: typing.List[Stage] = [] + self._stages: typing.List[_pipeline.Stage] = [] # Complete set of sources across segments in this pipeline - self._sources: typing.List[SourceStage] = [] + self._sources: typing.List[_pipeline.SourceStage] = [] # Dictionary containing segment information for this pipeline self._segments: typing.Dict = defaultdict(lambda: {"nodes": set(), "ingress_ports": [], "egress_ports": []}) @@ -123,10 +117,10 @@ def add_stage(self, stage: StageT, segment_id: str = "main") -> StageT: segment_graph = self._segment_graphs[segment_id] # Add to list of stages if it's a stage, not a source - if (isinstance(stage, Stage)): + if (isinstance(stage, _pipeline.Stage)): segment_nodes.add(stage) self._stages.append(stage) - elif (isinstance(stage, SourceStage)): + elif (isinstance(stage, _pipeline.SourceStage)): segment_nodes.add(stage) self._sources.append(stage) else: @@ -139,8 +133,8 @@ def add_stage(self, stage: StageT, segment_id: str = "main") -> StageT: return stage def add_edge(self, - start: typing.Union[StageBase, Sender], - end: typing.Union[Stage, Receiver], + start: typing.Union[_pipeline.StageBase, _pipeline.Sender], + end: typing.Union[_pipeline.Stage, _pipeline.Receiver], segment_id: str = "main"): """ Create an edge between two stages and add it to a segment in the pipeline. 
@@ -159,7 +153,7 @@ def add_edge(self, """ self._assert_not_built() - if (isinstance(start, StageBase)): + if (isinstance(start, _pipeline.StageBase)): assert len(start.output_ports) > 0, \ "Cannot call `add_edge` with a stage with no output ports as the `start` parameter" assert len(start.output_ports) == 1, \ @@ -167,10 +161,10 @@ def add_edge(self, "instead `add_edge` must be called for each output port individually.") start_port = start.output_ports[0] - elif (isinstance(start, Sender)): + elif (isinstance(start, _pipeline.Sender)): start_port = start - if (isinstance(end, Stage)): + if (isinstance(end, _pipeline.Stage)): assert len(end.input_ports) > 0, \ "Cannot call `add_edge` with a stage with no input ports as the `end` parameter" assert len(end.input_ports) == 1, \ @@ -178,7 +172,7 @@ def add_edge(self, "instead `add_edge` must be called for each input port individually.") end_port = end.input_ports[0] - elif (isinstance(end, Receiver)): + elif (isinstance(end, _pipeline.Receiver)): end_port = end start_port._output_receivers.append(end_port) @@ -191,9 +185,9 @@ def add_edge(self, end_port_idx=end_port.port_number) def add_segment_edge(self, - egress_stage: BoundaryStageMixin, + egress_stage: _pipeline.BoundaryStageMixin, egress_segment: str, - ingress_stage: BoundaryStageMixin, + ingress_stage: _pipeline.BoundaryStageMixin, ingress_segment: str, port_pair: typing.Union[str, typing.Tuple[str, typing.Type, bool]]): """ @@ -221,7 +215,7 @@ def add_segment_edge(self, * bool: If the type is a shared pointer (typically should be `False`) """ self._assert_not_built() - assert isinstance(egress_stage, BoundaryStageMixin), "Egress stage must be a BoundaryStageMixin" + assert isinstance(egress_stage, _pipeline.BoundaryStageMixin), "Egress stage must be a BoundaryStageMixin" egress_edges = self._segments[egress_segment]["egress_ports"] egress_edges.append({ "port_pair": port_pair, @@ -230,7 +224,7 @@ def add_segment_edge(self, "receiver_segment": ingress_segment }) 
- assert isinstance(ingress_stage, BoundaryStageMixin), "Ingress stage must be a BoundaryStageMixin" + assert isinstance(ingress_stage, _pipeline.BoundaryStageMixin), "Ingress stage must be a BoundaryStageMixin" ingress_edges = self._segments[ingress_segment]["ingress_ports"] ingress_edges.append({ "port_pair": port_pair, @@ -256,7 +250,7 @@ def _pre_build(self): # topo_sort provides a reasonable approximation. for stage in networkx.topological_sort(segment_graph): needed_columns.update(stage.get_needed_columns()) - if (isinstance(stage, PreallocatorMixin)): + if (isinstance(stage, _pipeline.PreallocatorMixin)): preallocator_stages.append(stage) if (stage.can_pre_build()): @@ -278,7 +272,7 @@ def _pre_build(self): # Finally, execute the link phase (only necessary for circular pipelines) # for s in source_and_stages: for stage in segment_graph.nodes(): - for port in typing.cast(StageBase, stage).input_ports: + for port in typing.cast(_pipeline.StageBase, stage).input_ports: port.link_schema() logger.info("====Pre-Building Segment Complete!====") @@ -334,7 +328,7 @@ def inner_build(builder: mrc.Builder, segment_id: str): # Finally, execute the link phase (only necessary for circular pipelines) for stage in segment_graph.nodes(): - for port in typing.cast(StageBase, stage).input_ports: + for port in typing.cast(_pipeline.StageBase, stage).input_ports: port.link_node(builder=builder) # Call the start method for the stages in this segment. 
Must run on the loop and wait for the result @@ -512,7 +506,7 @@ def visualize(self, filename: str = None, **graph_kwargs): start_def_port = ":e" if is_lr else ":s" end_def_port = ":w" if is_lr else ":n" - def has_ports(node: StageBase, is_input): + def has_ports(node: _pipeline.StageBase, is_input): if (is_input): return len(node.input_ports) > 0 @@ -523,7 +517,7 @@ def has_ports(node: StageBase, is_input): gv_subgraphs[segment_id] = graphviz.Digraph(f"cluster_{segment_id}") gv_subgraph = gv_subgraphs[segment_id] gv_subgraph.attr(label=segment_id) - for name, attrs in typing.cast(typing.Mapping[StageBase, dict], + for name, attrs in typing.cast(typing.Mapping[_pipeline.StageBase, dict], self._segment_graphs[segment_id].nodes).items(): node_attrs = attrs.copy() @@ -562,7 +556,7 @@ def has_ports(node: StageBase, is_input): # Build up edges for segment_id in self._segments: gv_subgraph = gv_subgraphs[segment_id] - for e, attrs in typing.cast(typing.Mapping[typing.Tuple[StageBase, StageBase], dict], + for e, attrs in typing.cast(typing.Mapping[typing.Tuple[_pipeline.StageBase, _pipeline.StageBase], dict], self._segment_graphs[segment_id].edges()).items(): # noqa: E501 edge_attrs = {} diff --git a/morpheus/pipeline/receiver.py b/morpheus/pipeline/receiver.py index b0f2637851..fcc7d3f30f 100644 --- a/morpheus/pipeline/receiver.py +++ b/morpheus/pipeline/receiver.py @@ -17,7 +17,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.utils.type_utils import greatest_ancestor logger = logging.getLogger(__name__) diff --git a/morpheus/pipeline/sender.py b/morpheus/pipeline/sender.py index c58bc1a347..701cac3250 100644 --- a/morpheus/pipeline/sender.py +++ b/morpheus/pipeline/sender.py @@ -17,7 +17,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import logger = logging.getLogger(__name__) diff --git 
a/morpheus/pipeline/single_output_source.py b/morpheus/pipeline/single_output_source.py index 9b552898e3..c9bd1fd826 100644 --- a/morpheus/pipeline/single_output_source.py +++ b/morpheus/pipeline/single_output_source.py @@ -18,7 +18,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config from morpheus.utils.type_utils import pretty_print_type_name diff --git a/morpheus/pipeline/single_port_stage.py b/morpheus/pipeline/single_port_stage.py index 7c5471c048..b9ea20aeeb 100644 --- a/morpheus/pipeline/single_port_stage.py +++ b/morpheus/pipeline/single_port_stage.py @@ -19,7 +19,7 @@ import mrc import typing_utils -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config from morpheus.utils.type_utils import pretty_print_type_name diff --git a/morpheus/pipeline/source_stage.py b/morpheus/pipeline/source_stage.py index 6d8f4f23c5..2778cf6590 100644 --- a/morpheus/pipeline/source_stage.py +++ b/morpheus/pipeline/source_stage.py @@ -18,7 +18,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config logger = logging.getLogger(__name__) diff --git a/morpheus/pipeline/stage.py b/morpheus/pipeline/stage.py index c9c03b65e4..7c0bb0475f 100644 --- a/morpheus/pipeline/stage.py +++ b/morpheus/pipeline/stage.py @@ -16,7 +16,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import logger = logging.getLogger(__name__) diff --git a/morpheus/pipeline/stage_base.py b/morpheus/pipeline/stage_base.py index c71146a060..3aa3b2f450 100644 --- a/morpheus/pipeline/stage_base.py +++ b/morpheus/pipeline/stage_base.py @@ -24,7 +24,7 @@ import mrc -import morpheus.pipeline as _pipeline +import morpheus.pipeline as _pipeline # pylint: 
disable=cyclic-import from morpheus.config import Config from morpheus.config import CppConfig from morpheus.utils.atomic_integer import AtomicInteger @@ -99,6 +99,9 @@ def __init__(self, config: Config): # Mapping of {`column_name`: `TyepId`} self._needed_columns = collections.OrderedDict() + # Schema of the stage + self._schema = _pipeline.StageSchema(self) + def __init_subclass__(cls) -> None: # Wrap __init__ to save the arg values @@ -345,14 +348,15 @@ def _pre_build(self, do_propagate: bool = True): schema = _pipeline.StageSchema(self) self._pre_compute_schema(schema) self.compute_schema(schema) + self._schema = schema - assert len(schema.output_schemas) == len(self.output_ports), \ + assert len(self._schema.output_schemas) == len(self.output_ports), \ (f"Prebuild expected `schema.output_schemas` to be of length {len(self.output_ports)} " - f"(one for each output port), but got {len(schema.output_schemas)}.") + f"(one for each output port), but got {len(self._schema.output_schemas)}.") - schema._complete() + self._schema._complete() - for (port_idx, port_schema) in enumerate(schema.output_schemas): + for (port_idx, port_schema) in enumerate(self._schema.output_schemas): self.output_ports[port_idx].output_schema = port_schema self._is_pre_built = True diff --git a/morpheus/pipeline/stage_decorator.py b/morpheus/pipeline/stage_decorator.py index b7ca6e6f9d..60531260e6 100644 --- a/morpheus/pipeline/stage_decorator.py +++ b/morpheus/pipeline/stage_decorator.py @@ -24,18 +24,15 @@ import cudf +import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.common import TypeId from morpheus.config import Config from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage -from morpheus.pipeline.preallocator_mixin import PreallocatorMixin -from morpheus.pipeline.single_output_source import SingleOutputSource -from morpheus.pipeline.single_port_stage import SinglePortStage -from morpheus.pipeline.stage_schema import 
StageSchema logger = logging.getLogger(__name__) GeneratorType = typing.Callable[..., collections.abc.Iterator[typing.Any]] -ComputeSchemaType = typing.Callable[[StageSchema], None] +ComputeSchemaType = typing.Callable[[_pipeline.StageSchema], None] def _get_name_from_fn(fn: typing.Callable) -> str: @@ -71,7 +68,7 @@ def _validate_keyword_arguments(fn_name: str, f"{fn_name} contains '{param.name}' that was not provided with a value") -class WrappedFunctionSourceStage(SingleOutputSource): +class WrappedFunctionSourceStage(_pipeline.SingleOutputSource): """ Source stage that wraps a generator function as the method for generating messages. @@ -109,14 +106,14 @@ def name(self) -> str: def supports_cpp_node(self) -> bool: return False - def compute_schema(self, schema: StageSchema): + def compute_schema(self, schema: _pipeline.StageSchema): self._compute_schema_fn(schema) def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: return builder.make_source(self.unique_name, self._gen_fn) -class PreAllocatedWrappedFunctionStage(PreallocatorMixin, WrappedFunctionSourceStage): +class PreAllocatedWrappedFunctionStage(_pipeline.PreallocatorMixin, WrappedFunctionSourceStage): """ Source stage that wraps a generator function as the method for generating messages. 
@@ -184,11 +181,13 @@ def wrapper(config: Config, **kwargs) -> WrappedFunctionSourceStage: if isinstance(return_type, (typing.GenericAlias, typing._GenericAlias)): return_type = return_type.__args__[0] - if compute_schema_fn is None: # pylint: disable=used-before-assignment + if compute_schema_fn is None: - def compute_schema_fn(schema: StageSchema): + def compute_schema_fn_inner(schema: _pipeline.StageSchema): schema.output_schema.set_type(return_type) + compute_schema_fn = compute_schema_fn_inner + _validate_keyword_arguments(name, signature, kwargs, param_iter=iter(signature.parameters.values())) bound_gen_fn = functools.partial(gen_fn, **kwargs) @@ -209,7 +208,7 @@ def compute_schema_fn(schema: StageSchema): return wrapper -class WrappedFunctionStage(SinglePortStage): +class WrappedFunctionStage(_pipeline.SinglePortStage): """ Stage that wraps a function to be used for processing messages. @@ -262,7 +261,7 @@ def accepted_types(self) -> typing.Tuple: def supports_cpp_node(self) -> bool: return False - def compute_schema(self, schema: StageSchema): + def compute_schema(self, schema: _pipeline.StageSchema): self._compute_schema_fn(schema) def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: @@ -345,7 +344,7 @@ def wrapper(config: Config, **kwargs) -> WrappedFunctionStage: raise ValueError( "Stage functions must have either a return type annotation or specify a compute_schema_fn") - def compute_schema_fn(schema: StageSchema): + def compute_schema_fn_inner(schema: _pipeline.StageSchema): if return_type is typing.Any: out_type = schema.input_schema.get_type() else: @@ -353,6 +352,8 @@ def compute_schema_fn(schema: StageSchema): schema.output_schema.set_type(out_type) + compute_schema_fn = compute_schema_fn_inner + _validate_keyword_arguments(name, signature, kwargs, param_iter=param_iter) bound_on_data_fn = functools.partial(on_data_fn, **kwargs) diff --git a/morpheus/stages/postprocess/add_classifications_stage.py 
b/morpheus/stages/postprocess/add_classifications_stage.py index 8c7544ec78..40e37f264f 100644 --- a/morpheus/stages/postprocess/add_classifications_stage.py +++ b/morpheus/stages/postprocess/add_classifications_stage.py @@ -20,6 +20,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.stages.postprocess.add_scores_stage_base import AddScoresStageBase logger = logging.getLogger(__name__) @@ -69,4 +70,13 @@ def supports_cpp_node(self): def _get_cpp_node(self, builder: mrc.Builder): import morpheus._lib.stages as _stages - return _stages.AddClassificationsStage(builder, self.unique_name, self._idx2label, self._threshold) + if (self._schema.input_type == ControlMessage): + return _stages.AddClassificationsControlMessageStage(builder, + self.unique_name, + self._idx2label, + self._threshold) + + return _stages.AddClassificationsMultiResponseMessageStage(builder, + self.unique_name, + self._idx2label, + self._threshold) diff --git a/morpheus/stages/postprocess/add_scores_stage.py b/morpheus/stages/postprocess/add_scores_stage.py index a9a325c199..3d83866052 100644 --- a/morpheus/stages/postprocess/add_scores_stage.py +++ b/morpheus/stages/postprocess/add_scores_stage.py @@ -20,6 +20,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.stages.postprocess.add_scores_stage_base import AddScoresStageBase logger = logging.getLogger(__name__) @@ -66,4 +67,7 @@ def supports_cpp_node(self): def _get_cpp_node(self, builder: mrc.Builder): import morpheus._lib.stages as _stages - return _stages.AddScoresStage(builder, self.unique_name, self._idx2label) + if (self._schema.input_type == ControlMessage): + return _stages.AddScoresControlMessageStage(builder, self.unique_name, self._idx2label) + + return 
_stages.AddScoresMultiResponseMessageStage(builder, self.unique_name, self._idx2label) diff --git a/morpheus/stages/postprocess/add_scores_stage_base.py b/morpheus/stages/postprocess/add_scores_stage_base.py index f437a41bfc..40320d26a1 100644 --- a/morpheus/stages/postprocess/add_scores_stage_base.py +++ b/morpheus/stages/postprocess/add_scores_stage_base.py @@ -22,6 +22,7 @@ from morpheus.common import TypeId from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages import MultiResponseMessage from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -90,14 +91,13 @@ def accepted_types(self) -> typing.Tuple: Accepted input types. """ - return (MultiResponseMessage, ) + return (MultiResponseMessage, ControlMessage) @abstractmethod def _get_cpp_node(self, builder: mrc.Builder): pass def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - # Convert the messages to rows of strings if self._build_cpp_node(): node = self._get_cpp_node(builder=builder) @@ -111,9 +111,49 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> # Return input type unchanged return node + @typing.overload + @staticmethod + def _add_labels(x: MultiResponseMessage, idx2label: dict[int, str], + threshold: typing.Optional[float]) -> MultiResponseMessage: + ... + + @typing.overload @staticmethod - def _add_labels(x: MultiResponseMessage, idx2label: typing.Dict[int, str], threshold: typing.Optional[float]): + def _add_labels(x: ControlMessage, idx2label: dict[int, str], threshold: typing.Optional[float]) -> ControlMessage: + ... 
+ + @staticmethod + def _add_labels(x: MultiResponseMessage | ControlMessage, + idx2label: dict[int, str], + threshold: typing.Optional[float]): + if isinstance(x, ControlMessage): + return AddScoresStageBase.process_control_message(x, idx2label, threshold) + if isinstance(x, MultiResponseMessage): + return AddScoresStageBase.process_multi_message(x, idx2label, threshold) + raise TypeError("Unsupported message type") + + @staticmethod + def process_control_message(x: ControlMessage, idx2label: typing.Dict[int, str], threshold: typing.Optional[float]): + probs = x.tensors().get_tensor("probs") + + if (probs.shape[1] <= max(idx2label.keys())): + raise RuntimeError(("Model output did not contain enough columns to fufill the requested labels. " + f"Label indexes: {idx2label}, Model output columns: {probs.shape[1]}")) + if (threshold is not None): + probs = (probs > threshold).astype(bool) + + # Do these one at a time to prevent failures + for i, label in idx2label.items(): + x.payload().set_data(label, probs[:, i]) + + # Return the same object + return x + + @staticmethod + def process_multi_message(x: MultiResponseMessage, + idx2label: typing.Dict[int, str], + threshold: typing.Optional[float]): probs = x.get_probs_tensor() if (probs.shape[1] <= max(idx2label.keys())): diff --git a/morpheus/stages/postprocess/serialize_stage.py b/morpheus/stages/postprocess/serialize_stage.py index b3b7d9bea1..8262e1b4e1 100644 --- a/morpheus/stages/postprocess/serialize_stage.py +++ b/morpheus/stages/postprocess/serialize_stage.py @@ -23,6 +23,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.serialize_controller import SerializeController +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage from morpheus.pipeline.single_port_stage import SinglePortStage @@ -76,11 +77,11 @@ def accepted_types(self) -> typing.Tuple: Returns ------- - 
typing.Tuple(`morpheus.pipeline.messages.MultiMessage`, ) + typing.Tuple(`morpheus.pipeline.messages.MultiMessage`, `morpheus.pipeline.messages.ControlMessage`) Accepted input types. """ - return (MultiMessage, ) + return (MultiMessage, ControlMessage) def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(MessageMeta) @@ -91,11 +92,18 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: if (self._build_cpp_node()): - node = _stages.SerializeStage(builder, - self.unique_name, - self._controller.include_columns or [], - self._controller.exclude_columns, - self._controller.fixed_columns) + if (self._schema.input_type == ControlMessage): + node = _stages.SerializeControlMessageStage(builder, + self.unique_name, + self._controller.include_columns or [], + self._controller.exclude_columns, + self._controller.fixed_columns) + else: + node = _stages.SerializeMultiMessageStage(builder, + self.unique_name, + self._controller.include_columns or [], + self._controller.exclude_columns, + self._controller.fixed_columns) else: include_columns = self._controller.get_include_col_pattern() exclude_columns = self._controller.get_exclude_col_pattern() diff --git a/morpheus/stages/preprocess/preprocess_base_stage.py b/morpheus/stages/preprocess/preprocess_base_stage.py index 56d44f8166..3731912026 100644 --- a/morpheus/stages/preprocess/preprocess_base_stage.py +++ b/morpheus/stages/preprocess/preprocess_base_stage.py @@ -44,6 +44,7 @@ def __init__(self, c: Config): self._preprocess_fn = None self._should_log_timestamps = True + self._use_control_message = False def accepted_types(self) -> typing.Tuple: """ @@ -57,10 +58,14 @@ def accepted_types(self) -> typing.Tuple: def compute_schema(self, schema: StageSchema): out_type = MultiInferenceMessage + if (schema.input_type == ControlMessage): + self._use_control_message = True + out_type = ControlMessage + else: + self._use_control_message 
= False self._preprocess_fn = self._get_preprocess_fn() preproc_sig = inspect.signature(self._preprocess_fn) - # If the innerfunction returns a type annotation, update the output type if (preproc_sig.return_annotation and typing_utils.issubtype(preproc_sig.return_annotation, MultiInferenceMessage)): diff --git a/morpheus/stages/preprocess/preprocess_fil_stage.py b/morpheus/stages/preprocess/preprocess_fil_stage.py index 6a06738cf5..45b1640d72 100644 --- a/morpheus/stages/preprocess/preprocess_fil_stage.py +++ b/morpheus/stages/preprocess/preprocess_fil_stage.py @@ -27,10 +27,12 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes +from morpheus.messages import ControlMessage from morpheus.messages import InferenceMemoryFIL from morpheus.messages import MultiInferenceFILMessage from morpheus.messages import MultiInferenceMessage from morpheus.messages import MultiMessage +from morpheus.messages import TensorMemory as CppTensorMemory from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage logger = logging.getLogger(__name__) @@ -84,12 +86,53 @@ def pre_process_batch(x: MultiMessage, fea_len: int, fea_cols: typing.List[str]) FIL inference message. """ + if isinstance(x, ControlMessage): + return PreprocessFILStage.process_control_message(x, fea_len, fea_cols) + if isinstance(x, MultiMessage): + return PreprocessFILStage.process_multi_message(x, fea_len, fea_cols) + raise TypeError(f"Unsupported message type: {type(x)}") + + @staticmethod + def process_control_message(x: ControlMessage, fea_len: int, fea_cols: typing.List[str]) -> ControlMessage: + try: + df: cudf.DataFrame = x.payload().get_data(fea_cols) + except KeyError: + logger.exception("Requested feature columns does not exist in the dataframe.", exc_info=True) + raise + + # Extract just the numbers from each feature col. Not great to operate on x.meta.df here but the operations will + # only happen once. 
+ for col in fea_cols: + if (df[col].dtype == np.dtype(str) or df[col].dtype == np.dtype(object)): + # If the column is a string, parse the number + df[col] = df[col].str.extract(r"(\d+)", expand=False).astype("float32") + elif (df[col].dtype != np.float32): + # Convert to float32 + df[col] = df[col].astype("float32") + + if (isinstance(df, pd.DataFrame)): + df = cudf.from_pandas(df) + + # Convert the dataframe to cupy the same way cuml does + data = cp.asarray(df.to_cupy()) + + count = data.shape[0] + + seg_ids = cp.zeros((count, 3), dtype=cp.uint32) + seg_ids[:, 0] = cp.arange(0, count, dtype=cp.uint32) + seg_ids[:, 2] = fea_len - 1 + + x.tensors(CppTensorMemory(count=count, tensors={"input__0": data, "seq_ids": seg_ids})) + return x + + @staticmethod + def process_multi_message(x: MultiMessage, fea_len: int, fea_cols: typing.List[str]) -> MultiInferenceFILMessage: try: df = x.get_meta(fea_cols) except KeyError: - logger.exception("Cound not get metadat for columns.") - return None + logger.exception("Requested feature columns does not exist in the dataframe.", exc_info=True) + raise # Extract just the numbers from each feature col. Not great to operate on x.meta.df here but the operations will # only happen once. 
@@ -120,8 +163,14 @@ def pre_process_batch(x: MultiMessage, fea_len: int, fea_cols: typing.List[str]) return infer_message - def _get_preprocess_fn(self) -> typing.Callable[[MultiMessage], MultiInferenceMessage]: + def _get_preprocess_fn( + self + ) -> typing.Callable[[typing.Union[MultiMessage, ControlMessage]], + typing.Union[MultiInferenceMessage, ControlMessage]]: return partial(PreprocessFILStage.pre_process_batch, fea_len=self._fea_length, fea_cols=self.features) def _get_preprocess_node(self, builder: mrc.Builder): - return _stages.PreprocessFILStage(builder, self.unique_name, self.features) + if (self._use_control_message): + return _stages.PreprocessFILControlMessageStage(builder, self.unique_name, self.features) + + return _stages.PreprocessFILMultiMessageStage(builder, self.unique_name, self.features) diff --git a/morpheus/stages/preprocess/preprocess_nlp_stage.py b/morpheus/stages/preprocess/preprocess_nlp_stage.py index 8b45dafe37..feace923dc 100644 --- a/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -25,8 +25,6 @@ import cudf import morpheus._lib.stages as _stages -# pylint: disable=morpheus-incorrect-lib-from-import -from morpheus._lib.messages import TensorMemory as CppTensorMemory from morpheus.cli.register_stage import register_stage from morpheus.cli.utils import MorpheusRelativePath from morpheus.cli.utils import get_package_relative_file @@ -37,6 +35,7 @@ from morpheus.messages import MultiInferenceMessage from morpheus.messages import MultiInferenceNLPMessage from morpheus.messages import MultiMessage +from morpheus.messages import TensorMemory as CppTensorMemory from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage from morpheus.utils.cudf_subword_helper import tokenize_text_series @@ -214,7 +213,6 @@ def process_control_message(message: ControlMessage, })) message.set_metadata("inference_memory_params", {"inference_type": "nlp"}) - return message 
@staticmethod @@ -264,12 +262,23 @@ def _get_preprocess_fn( column=self._column) def _get_preprocess_node(self, builder: mrc.Builder): - return _stages.PreprocessNLPStage(builder, - self.unique_name, - self._vocab_hash_file, - self._seq_length, - self._truncation, - self._do_lower_case, - self._add_special_tokens, - self._stride, - self._column) + if (self._use_control_message): + return _stages.PreprocessNLPControlMessageStage(builder, + self.unique_name, + self._vocab_hash_file, + self._seq_length, + self._truncation, + self._do_lower_case, + self._add_special_tokens, + self._stride, + self._column) + + return _stages.PreprocessNLPMultiMessageStage(builder, + self.unique_name, + self._vocab_hash_file, + self._seq_length, + self._truncation, + self._do_lower_case, + self._add_special_tokens, + self._stride, + self._column) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 59ce19a6ba..f05d3cbe5b 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -54,7 +54,7 @@ def process_dataframe(df_in: typing.Union[pd.DataFrame, cudf.DataFrame], input_s """ - from morpheus.utils import schema_transforms + from morpheus.utils import schema_transforms # pylint: disable=cyclic-import return schema_transforms.process_dataframe(df_in, input_schema) diff --git a/morpheus/utils/directory_watcher.py b/morpheus/utils/directory_watcher.py index baaeb4acfc..3fe6274b44 100644 --- a/morpheus/utils/directory_watcher.py +++ b/morpheus/utils/directory_watcher.py @@ -205,7 +205,7 @@ def _generate_via_watcher(self): while True: try: - files, is_event = file_queue.get(timeout=self._batch_timeout) + files, is_event = file_queue.get(timeout=self._batch_timeout) # pylint: disable=unpacking-non-sequence if (is_event): # We may be getting files one at a time from the folder watcher, wait a bit diff --git a/tests/_utils/stages/conv_msg.py b/tests/_utils/stages/conv_msg.py index 31151e3f0e..aa5886c4f7 100755 --- 
a/tests/_utils/stages/conv_msg.py +++ b/tests/_utils/stages/conv_msg.py @@ -22,8 +22,10 @@ import cudf +import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages import MultiMessage from morpheus.messages import MultiResponseMessage from morpheus.messages import ResponseMemory @@ -31,12 +33,14 @@ from morpheus.pipeline.stage_schema import StageSchema -@register_stage("unittest-conv-msg", ignore_args=["expected_data"]) +@register_stage("unittest-conv-msg", ignore_args=["expected_data", "message_type"]) class ConvMsg(SinglePortStage): """ - Simple test stage to convert a MultiMessage to a MultiResponseProbsMessage + Simple test stage to convert a MultiMessage to a MultiResponseProbsMessage, + or a ControlMessage to a ControlMessage with probs tensor. Basically a cheap replacement for running an inference stage. + Set `message_type` to determine the input type of the stage. Setting `expected_data` to a DataFrame will cause the probs array to by populated by the values in the DataFrame. Setting `expected_data` to `None` causes the probs array to be a copy of the incoming dataframe. Setting `columns` restricts the columns copied into probs to just the ones specified. 
@@ -50,12 +54,14 @@ def __init__(self, columns: typing.List[str] = None, order: str = 'K', probs_type: str = 'f4', - empty_probs: bool = False): + empty_probs: bool = False, + message_type: type[MultiResponseMessage] | type[ControlMessage] = MultiResponseMessage): super().__init__(c) if expected_data is not None: assert isinstance(expected_data, (pd.DataFrame, cudf.DataFrame)) + self._message_type = message_type self._expected_data = expected_data self._columns = columns self._order = order @@ -67,15 +73,18 @@ def name(self) -> str: return "test" def accepted_types(self) -> typing.Tuple: - return (MultiMessage, ) + return ( + MultiMessage, + ControlMessage, + ) def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(MultiResponseMessage) + schema.output_schema.set_type(self._message_type) def supports_cpp_node(self) -> bool: return False - def _conv_message(self, message: MultiMessage) -> MultiResponseMessage: + def _conv_message(self, message: MultiMessage | ControlMessage) -> MultiResponseMessage | ControlMessage: if self._expected_data is not None: if (isinstance(self._expected_data, cudf.DataFrame)): df = self._expected_data.copy(deep=True) @@ -83,16 +92,23 @@ def _conv_message(self, message: MultiMessage) -> MultiResponseMessage: df = cudf.DataFrame(self._expected_data) else: - if self._columns is not None: - df = message.get_meta(self._columns) + if (isinstance(message, MultiMessage)): + if (self._columns is None): + df = message.get_meta() + else: + df = message.get_meta(self._columns) else: - df = message.get_meta() + df: cudf.DataFrame = message.payload().get_data(self._columns) # type: ignore if self._empty_probs: probs = cp.zeros([len(df), 3], 'float') else: probs = cp.array(df.values, dtype=self._probs_type, copy=True, order=self._order) + if (isinstance(message, ControlMessage)): + message.tensors(_messages.TensorMemory(count=len(probs), tensors={'probs': probs})) + return message + memory = ResponseMemory(count=len(probs), 
tensors={'probs': probs}) return MultiResponseMessage.from_message(message, memory=memory) diff --git a/tests/examples/digital_fingerprinting/utils/test_config_generator.py b/tests/examples/digital_fingerprinting/utils/test_config_generator.py new file mode 100644 index 0000000000..40d4f37b67 --- /dev/null +++ b/tests/examples/digital_fingerprinting/utils/test_config_generator.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from datetime import datetime + +import pytest + +from morpheus.config import Config + + +@pytest.fixture(name="dfp_arg_parser") +def dfp_arg_parser_fixture(): + from dfp.utils.dfp_arg_parser import DFPArgParser + dfp_arg_parser = DFPArgParser(skip_user=["unittest-skip-user"], + only_user=["unittest-only-user"], + start_time=datetime(1993, 4, 5, 6, 7, 8), + log_level=logging.DEBUG, + cache_dir=".cache", + sample_rate_s="20", + duration="2days", + source="unittest", + tracking_uri="http://unittest", + silence_monitors=False, + mlflow_experiment_name_formatter="unittest-experiment", + mlflow_model_name_formatter="unittest-model", + train_users="unittest-train-users") + dfp_arg_parser.init() + yield dfp_arg_parser + + +@pytest.fixture(name="schema") +def schema_fixture(config: Config): + from dfp.utils.schema_utils import SchemaBuilder + schema_builder = SchemaBuilder(config, "duo") + yield schema_builder.build_schema() + + +def test_constructor(config: Config, dfp_arg_parser: "DFPArgParser", schema: "Schema"): # noqa: F821 + from dfp.utils.config_generator import ConfigGenerator + + config_generator = ConfigGenerator(config=config, dfp_arg_parser=dfp_arg_parser, schema=schema, encoding="latin1") + + assert config_generator._config is config + assert config_generator._dfp_arg_parser is dfp_arg_parser + assert config_generator._encoding == "latin1" + assert config_generator._start_time_str == "1993-04-05T06:07:08+00:00" + assert config_generator._end_time_str == "1993-04-07T06:07:08+00:00" diff --git a/tests/io/test_loader_registry.py b/tests/io/test_loader_registry.py index edc58fe9e4..cbeccf69e5 100644 --- a/tests/io/test_loader_registry.py +++ b/tests/io/test_loader_registry.py @@ -33,7 +33,7 @@ def test_loader_registry_contains(): loaders = DataLoaderRegistry.list() for loader in should_have: # Make sure all the loaders in the registry are in the list - assert (loader in loaders) + assert (loader in loaders) # pylint: 
disable=unsupported-membership-test # Make sure all the loaders in the list are contained in the registry assert (DataLoaderRegistry.contains(loader)) diff --git a/tests/llm/test_llm.py b/tests/llm/test_llm.py index 5033f9e036..608db37a8f 100644 --- a/tests/llm/test_llm.py +++ b/tests/llm/test_llm.py @@ -172,7 +172,7 @@ class SinkNode(LLMNodeBase): def get_input_names(self): return ["nested_answers", "answers"] - async def execute(self, context: LLMContext): + async def execute(self, context: LLMContext): # pylint: disable=invalid-overridden-method nested_answers = context.get_input("nested_answers") answers = context.get_input("answers") @@ -188,7 +188,7 @@ class SimpleTaskHandler(LLMTaskHandler): def get_input_names(self): return ["response"] - async def try_handle(self, context: LLMContext): + async def try_handle(self, context: LLMContext): # pylint: disable=invalid-overridden-method with context.message().payload().mutable_dataframe() as df: df["response"] = context.get_input() diff --git a/tests/modules/test_file_batcher.py b/tests/modules/test_file_batcher.py index 463b2fef21..fab99fdb48 100644 --- a/tests/modules/test_file_batcher.py +++ b/tests/modules/test_file_batcher.py @@ -54,8 +54,8 @@ def default_module_config_fixture(): "module_name": "file_batcher", "namespace": MORPHEUS_MODULE_NAMESPACE, "sampling_rate_s": 0, - "start_time": "2022-08-01", - "end_time": "2022-08-31", + "start_time": "2022-08-01T00:00:00", + "end_time": "2022-08-31T00:00:00", "parser_kwargs": None, "schema": { "schema_str": None, "encoding": None diff --git a/tests/modules/test_from_control_message.py b/tests/modules/test_from_control_message.py index 6eb0829fba..b129bbbcc8 100644 --- a/tests/modules/test_from_control_message.py +++ b/tests/modules/test_from_control_message.py @@ -68,7 +68,7 @@ def test_get_module(): assert fn_constructor is not None config = {} - fn_constructor("FromControlMessageTest", config) + fn_constructor("FromControlMessageTest", config) # pylint: 
disable=not-callable @pytest.mark.use_cpp diff --git a/tests/modules/test_morpheus_modules.py b/tests/modules/test_morpheus_modules.py index 62765d851d..13a5fe56cd 100644 --- a/tests/modules/test_morpheus_modules.py +++ b/tests/modules/test_morpheus_modules.py @@ -66,7 +66,7 @@ def test_get_module(): assert fn_constructor is not None config = {} - # pylint: disable=unused-variable + # pylint: disable=unused-variable,not-callable module_instance = fn_constructor("ModuleDataLoaderTest", config) # noqa: F841 -- we don't need to use it diff --git a/tests/modules/test_payload_batcher.py b/tests/modules/test_payload_batcher.py index 47f43849d7..02acd6b8ee 100644 --- a/tests/modules/test_payload_batcher.py +++ b/tests/modules/test_payload_batcher.py @@ -79,7 +79,7 @@ def test_get_module(): assert fn_constructor is not None config = {} - module_instance = fn_constructor("PayloadBatcherTest", config) + module_instance = fn_constructor("PayloadBatcherTest", config) # pylint: disable=not-callable assert isinstance(module_instance, mrc.core.segment.SegmentModule) diff --git a/tests/modules/test_to_control_message.py b/tests/modules/test_to_control_message.py index 4b66ae91e3..96f91a2fee 100644 --- a/tests/modules/test_to_control_message.py +++ b/tests/modules/test_to_control_message.py @@ -57,7 +57,7 @@ def test_get_module(): assert fn_constructor is not None config = {} - module_instance = fn_constructor("ToControlMessageTest", config) + module_instance = fn_constructor("ToControlMessageTest", config) # pylint: disable=not-callable assert isinstance(module_instance, mrc.core.segment.SegmentModule) diff --git a/tests/stages/test_preprocess_fil_stage.py b/tests/stages/test_preprocess_fil_stage.py new file mode 100644 index 0000000000..eb6dc8b620 --- /dev/null +++ b/tests/stages/test_preprocess_fil_stage.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupy as cp +import pytest + +import cudf + +from morpheus.config import Config +from morpheus.config import ConfigFIL +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.messages import MultiMessage +from morpheus.stages.preprocess.preprocess_fil_stage import PreprocessFILStage + + +@pytest.fixture(name='config') +def fixture_config(config: Config): + config.feature_length = 1 + config.fil = ConfigFIL() + config.fil.feature_columns = ["data"] + yield config + + +def test_constructor(config: Config): + stage = PreprocessFILStage(config) + assert stage.name == "preprocess-fil" + assert stage._fea_length == config.feature_length + assert stage.features == config.fil.feature_columns + + accepted_types = stage.accepted_types() + assert isinstance(accepted_types, tuple) + assert len(accepted_types) > 0 + + +def test_process_control_message(config: Config): + stage = PreprocessFILStage(config) + input_cm = ControlMessage() + df = cudf.DataFrame({"data": [1, 2, 3]}) + meta = MessageMeta(df) + input_cm.payload(meta) + + output_cm = stage.pre_process_batch(input_cm, stage._fea_length, stage.features) + assert cp.array_equal(output_cm.tensors().get_tensor("input__0"), cp.asarray(df.to_cupy())) + expect_seg_ids = cp.zeros((df.shape[0], 3), dtype=cp.uint32) + expect_seg_ids[:, 0] = cp.arange(0, df.shape[0], dtype=cp.uint32) + 
expect_seg_ids[:, 2] = stage._fea_length - 1 + assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), expect_seg_ids) + + +def test_process_multi_message(config: Config): + stage = PreprocessFILStage(config) + df = cudf.DataFrame({"data": [1, 2, 3]}) + meta = MessageMeta(df) + mess_offset = 0 + input_multi_message = MultiMessage(meta=meta, mess_offset=mess_offset, mess_count=3) + + output_infer_message = stage.pre_process_batch(input_multi_message, stage._fea_length, stage.features) + assert cp.array_equal(output_infer_message.input__0, cp.asarray(df.to_cupy())) + expect_seg_ids = cp.zeros((df.shape[0], 3), dtype=cp.uint32) + expect_seg_ids[:, 0] = cp.arange(0, df.shape[0], dtype=cp.uint32) + expect_seg_ids[:, 2] = stage._fea_length - 1 + assert cp.array_equal(output_infer_message.seq_ids, expect_seg_ids) + + +def test_process_control_message_and_multi_message(config: Config): + stage = PreprocessFILStage(config) + df = cudf.DataFrame({"data": [1, 2, 3]}) + meta = MessageMeta(df) + input_control_message = ControlMessage() + input_control_message.payload(meta) + + mess_offset = 0 + input_multi_message = MultiMessage(meta=meta, mess_offset=mess_offset, mess_count=3) + + output_control_message = stage.pre_process_batch(input_control_message, stage._fea_length, stage.features) + + output_infer_message = stage.pre_process_batch(input_multi_message, stage._fea_length, stage.features) + + # Check if each tensor in the control message is equal to the corresponding tensor in the inference message + for tensor_key in output_control_message.tensors().tensor_names: + assert cp.array_equal(output_control_message.tensors().get_tensor(tensor_key), + getattr(output_infer_message, tensor_key)) diff --git a/tests/stages/test_preprocess_nlp_stage.py b/tests/stages/test_preprocess_nlp_stage.py new file mode 100644 index 0000000000..9c2b5d4e39 --- /dev/null +++ b/tests/stages/test_preprocess_nlp_stage.py @@ -0,0 +1,163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock +from unittest.mock import patch + +import cupy as cp +import pytest + +import cudf + +from morpheus.config import Config +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.messages import MultiMessage +from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage + + +@pytest.fixture(name='config') +def fixture_config(config: Config): + config.class_labels = [ + "address", + "bank_acct", + "credit_card", + "email", + "govt_id", + "name", + "password", + "phone_num", + "secret_keys", + "user" + ] + config.edge_buffer_size = 4 + config.feature_length = 256 + config.mode = "NLP" + config.model_max_batch_size = 32 + config.num_threads = 1 + config.pipeline_batch_size = 64 + yield config + + +def test_constructor(config: Config): + stage = PreprocessNLPStage(config) + assert stage.name == "preprocess-nlp" + assert stage._column == "data" + assert stage._seq_length == 256 + assert stage._vocab_hash_file.endswith("data/bert-base-cased-hash.txt") + assert stage._truncation is False + assert stage._do_lower_case is False + assert stage._add_special_tokens is False + + accepted_types = stage.accepted_types() + assert isinstance(accepted_types, tuple) + assert len(accepted_types) > 0 + + 
+@patch("morpheus.stages.preprocess.preprocess_nlp_stage.tokenize_text_series") +def test_process_control_message(mock_tokenize_text_series, config: Config): + mock_tokenized = Mock() + mock_tokenized.input_ids = cp.array([[1, 2], [1, 2]]) + mock_tokenized.input_mask = cp.array([[3, 4], [3, 4]]) + mock_tokenized.segment_ids = cp.array([[0, 0], [1, 1]]) + mock_tokenize_text_series.return_value = mock_tokenized + + stage = PreprocessNLPStage(config) + input_cm = ControlMessage() + df = cudf.DataFrame({"data": ["a", "b", "c"]}) + meta = MessageMeta(df) + input_cm.payload(meta) + + output_cm = stage.pre_process_batch(input_cm, + stage._vocab_hash_file, + stage._do_lower_case, + stage._seq_length, + stage._stride, + stage._truncation, + stage._add_special_tokens, + stage._column) + assert output_cm.get_metadata("inference_memory_params") == {"inference_type": "nlp"} + assert cp.array_equal(output_cm.tensors().get_tensor("input_ids"), mock_tokenized.input_ids) + assert cp.array_equal(output_cm.tensors().get_tensor("input_mask"), mock_tokenized.input_mask) + assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), mock_tokenized.segment_ids) + + +@patch("morpheus.stages.preprocess.preprocess_nlp_stage.tokenize_text_series") +def test_process_multi_message(mock_tokenize_text_series, config: Config): + mock_tokenized = Mock() + mock_tokenized.input_ids = cp.array([[1, 2], [1, 2]]) + mock_tokenized.input_mask = cp.array([[3, 4], [3, 4]]) + mock_tokenized.segment_ids = cp.array([[0, 0], [1, 1]]) + mock_tokenize_text_series.return_value = mock_tokenized + + stage = PreprocessNLPStage(config) + df = cudf.DataFrame({"data": ["a", "b", "c"]}) + meta = MessageMeta(df) + mess_offset = 0 + input_multi_message = MultiMessage(meta=meta, mess_offset=mess_offset, mess_count=2) + + output_infer_message = stage.pre_process_batch(input_multi_message, + stage._vocab_hash_file, + stage._do_lower_case, + stage._seq_length, + stage._stride, + stage._truncation, + 
stage._add_special_tokens, + stage._column) + assert cp.array_equal(output_infer_message.input_ids, mock_tokenized.input_ids) + assert cp.array_equal(output_infer_message.input_mask, mock_tokenized.input_mask) + mock_tokenized.segment_ids[:, 0] = mock_tokenized.segment_ids[:, 0] + mess_offset + assert cp.array_equal(output_infer_message.seq_ids, mock_tokenized.segment_ids) + + +@patch("morpheus.stages.preprocess.preprocess_nlp_stage.tokenize_text_series") +def test_process_control_message_and_multi_message(mock_tokenize_text_series, config: Config): + mock_tokenized = Mock() + mock_tokenized.input_ids = cp.array([[1, 2], [1, 2]]) + mock_tokenized.input_mask = cp.array([[3, 4], [3, 4]]) + mock_tokenized.segment_ids = cp.array([[0, 0], [1, 1]]) + mock_tokenize_text_series.return_value = mock_tokenized + + stage = PreprocessNLPStage(config) + df = cudf.DataFrame({"data": ["a", "b", "c"]}) + meta = MessageMeta(df) + input_control_message = ControlMessage() + input_control_message.payload(meta) + + mess_offset = 0 + input_multi_message = MultiMessage(meta=meta, mess_offset=mess_offset, mess_count=2) + + output_control_message = stage.pre_process_batch(input_control_message, + stage._vocab_hash_file, + stage._do_lower_case, + stage._seq_length, + stage._stride, + stage._truncation, + stage._add_special_tokens, + stage._column) + + output_infer_message = stage.pre_process_batch(input_multi_message, + stage._vocab_hash_file, + stage._do_lower_case, + stage._seq_length, + stage._stride, + stage._truncation, + stage._add_special_tokens, + stage._column) + + # Check if each tensor in the control message is equal to the corresponding tensor in the inference message + for tensor_key in output_control_message.tensors().tensor_names: + assert cp.array_equal(output_control_message.tensors().get_tensor(tensor_key), + getattr(output_infer_message, tensor_key)) diff --git a/tests/test_add_classifications_stage.py b/tests/test_add_classifications_stage.py index fd5bea944b..279963ba9a 
100755 --- a/tests/test_add_classifications_stage.py +++ b/tests/test_add_classifications_stage.py @@ -20,7 +20,10 @@ import cudf from _utils.dataset_manager import DatasetManager +# pylint: disable=morpheus-incorrect-lib-from-import +from morpheus._lib.messages import TensorMemory as CppTensorMemory from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.messages.multi_response_message import MultiResponseMessage @@ -59,7 +62,7 @@ def test_constructor_errors(config: Config): @pytest.mark.use_python -def test_add_labels(): +def test_add_labels_with_multi_response_message_and_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} @@ -69,37 +72,55 @@ def test_add_labels(): probs_array = cp.array([[0.1, 0.6, 0.8], [0.3, 0.61, 0.9]]) probs_array_bool = probs_array > threshold - message = MultiResponseMessage(meta=MessageMeta(df), memory=TensorMemory(count=2, tensors={"probs": probs_array})) + mrm = MultiResponseMessage(meta=MessageMeta(df), memory=TensorMemory(count=2, tensors={"probs": probs_array})) - labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold) + labeled_mrm = AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=threshold) - DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0]) - DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1]) - DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("frogs"), probs_array_bool[:, 0]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("lizards"), probs_array_bool[:, 1]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("toads"), probs_array_bool[:, 2]) + + cm = ControlMessage() + cm.payload(MessageMeta(df)) + 
cm.tensors(CppTensorMemory(count=2, tensors={"probs": probs_array})) + + labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) + + # Check that the labeled control message and labeled multi response message are the same + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("frogs"), labeled_mrm.get_meta("frogs")) + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("lizards"), labeled_mrm.get_meta("lizards")) + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("toads"), labeled_mrm.get_meta("toads")) # Same thing but change the probs tensor name - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), - probs_tensor_name="other_probs") + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), + probs_tensor_name="other_probs") - labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold) + labeled_mrm = AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=threshold) - DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0]) - DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1]) - DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("frogs"), probs_array_bool[:, 0]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("lizards"), probs_array_bool[:, 1]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("toads"), probs_array_bool[:, 2]) # Fail in missing probs data - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), - probs_tensor_name="other_probs") - message.probs_tensor_name = "probs" + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"other_probs": 
probs_array}), + probs_tensor_name="other_probs") + mrm.probs_tensor_name = "probs" with pytest.raises(KeyError): - AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold) + AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=threshold) # Too small of a probs array - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + + with pytest.raises(RuntimeError): + AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=threshold) + + cm = ControlMessage() + cm.payload(MessageMeta(df)) + cm.tensors(CppTensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): - AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold) + AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) diff --git a/tests/test_add_classifications_stage_pipe.py b/tests/test_add_classifications_stage_pipe.py index 03acc9e043..9a05bf04ac 100755 --- a/tests/test_add_classifications_stage_pipe.py +++ b/tests/test_add_classifications_stage_pipe.py @@ -21,6 +21,7 @@ from _utils import assert_results from _utils.stages.conv_msg import ConvMsg +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage from morpheus.messages import MultiResponseMessage @@ -47,15 +48,34 @@ def test_add_classifications_stage_pipe(config, filter_probs_df): config.num_threads = 1 threshold = 0.75 - pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(ConvMsg(config, filter_probs_df)) - pipe.add_stage(AddClassificationsStage(config, threshold=threshold)) - pipe.add_stage(SerializeStage(config, 
include=[f"^{c}$" for c in config.class_labels])) - comp_stage = pipe.add_stage( + pipe_mm = LinearPipeline(config) + pipe_mm.set_source(InMemorySourceStage(config, [filter_probs_df])) + pipe_mm.add_stage(DeserializeStage(config)) + pipe_mm.add_stage(ConvMsg(config, filter_probs_df)) + pipe_mm.add_stage(AddClassificationsStage(config, threshold=threshold)) + pipe_mm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) + comp_stage = pipe_mm.add_stage( CompareDataFrameStage(config, build_expected(filter_probs_df.to_pandas(), threshold, config.class_labels))) - pipe.run() + pipe_mm.run() + + assert_results(comp_stage.get_results()) + + +@pytest.mark.use_cudf +def test_add_classifications_stage_pipe_with_control_message(config, filter_probs_df): + config.class_labels = ['frogs', 'lizards', 'toads', 'turtles'] + config.num_threads = 1 + threshold = 0.75 + + pipe_cm = LinearPipeline(config) + pipe_cm.set_source(InMemorySourceStage(config, [filter_probs_df])) + pipe_cm.add_stage(DeserializeStage(config, ensure_sliceable_index=True, message_type=ControlMessage)) + pipe_cm.add_stage(ConvMsg(config, filter_probs_df, message_type=ControlMessage)) + pipe_cm.add_stage(AddClassificationsStage(config, threshold=threshold)) + pipe_cm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) + comp_stage = pipe_cm.add_stage( + CompareDataFrameStage(config, build_expected(filter_probs_df.to_pandas(), threshold, config.class_labels))) + pipe_cm.run() assert_results(comp_stage.get_results()) @@ -66,19 +86,19 @@ def test_add_classifications_stage_multi_segment_pipe(config, filter_probs_df): config.num_threads = 1 threshold = 0.75 - pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) - pipe.add_segment_boundary(MessageMeta) - pipe.add_stage(DeserializeStage(config)) - pipe.add_segment_boundary(MultiMessage) - pipe.add_stage(ConvMsg(config, filter_probs_df)) - 
pipe.add_segment_boundary(MultiResponseMessage) - pipe.add_stage(AddClassificationsStage(config, threshold=threshold)) - pipe.add_segment_boundary(MultiResponseMessage) - pipe.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) - pipe.add_segment_boundary(MessageMeta) - comp_stage = pipe.add_stage( + pipe_mm = LinearPipeline(config) + pipe_mm.set_source(InMemorySourceStage(config, [filter_probs_df])) + pipe_mm.add_segment_boundary(MessageMeta) + pipe_mm.add_stage(DeserializeStage(config)) + pipe_mm.add_segment_boundary(MultiMessage) + pipe_mm.add_stage(ConvMsg(config, filter_probs_df)) + pipe_mm.add_segment_boundary(MultiResponseMessage) + pipe_mm.add_stage(AddClassificationsStage(config, threshold=threshold)) + pipe_mm.add_segment_boundary(MultiResponseMessage) + pipe_mm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) + pipe_mm.add_segment_boundary(MessageMeta) + comp_stage = pipe_mm.add_stage( CompareDataFrameStage(config, build_expected(filter_probs_df.to_pandas(), threshold, config.class_labels))) - pipe.run() + pipe_mm.run() assert_results(comp_stage.get_results()) diff --git a/tests/test_add_scores_stage.py b/tests/test_add_scores_stage.py index 2a343bcce0..ad67709959 100755 --- a/tests/test_add_scores_stage.py +++ b/tests/test_add_scores_stage.py @@ -19,8 +19,10 @@ import cudf +import morpheus._lib.messages as _messages from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.messages.multi_response_message import MultiResponseMessage @@ -61,43 +63,61 @@ def test_constructor_errors(config: Config): @pytest.mark.use_python -def test_add_labels(): +def test_add_labels_with_multi_response_message_and_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} df = 
cudf.DataFrame([0, 1], columns=["dummy"]) probs_array = cp.array([[0.1, 0.5, 0.8], [0.2, 0.6, 0.9]]) - message = MultiResponseMessage(meta=MessageMeta(df), memory=TensorMemory(count=2, tensors={"probs": probs_array})) + mrm = MultiResponseMessage(meta=MessageMeta(df), memory=TensorMemory(count=2, tensors={"probs": probs_array})) - labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None) + labeled_mrm = AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=None) - DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0]) - DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1]) - DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("frogs"), probs_array[:, 0]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("lizards"), probs_array[:, 1]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("toads"), probs_array[:, 2]) + + cm = ControlMessage() + cm.payload(MessageMeta(df)) + cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array})) + + labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) + + # Check that the labeled control message and labeled multi response message are the same + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("frogs"), labeled_mrm.get_meta("frogs")) + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("lizards"), labeled_mrm.get_meta("lizards")) + DatasetManager.assert_df_equal(labeled_cm.payload().get_data("toads"), labeled_mrm.get_meta("toads")) # Same thing but change the probs tensor name - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), - probs_tensor_name="other_probs") + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), + 
probs_tensor_name="other_probs") - labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None) + labeled_mrm = AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=None) - DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0]) - DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1]) - DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("frogs"), probs_array[:, 0]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("lizards"), probs_array[:, 1]) + DatasetManager.assert_df_equal(labeled_mrm.get_meta("toads"), probs_array[:, 2]) # Fail in missing probs data - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), - probs_tensor_name="other_probs") - message.probs_tensor_name = "probs" + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"other_probs": probs_array}), + probs_tensor_name="other_probs") + mrm.probs_tensor_name = "probs" with pytest.raises(KeyError): - AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None) + AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=None) # Too small of a probs array - message = MultiResponseMessage(meta=MessageMeta(df), - memory=TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + mrm = MultiResponseMessage(meta=MessageMeta(df), + memory=TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + + with pytest.raises(RuntimeError): + AddClassificationsStage._add_labels(mrm, idx2label=class_labels, threshold=None) + + cm = ControlMessage() + cm.payload(MessageMeta(df)) + cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): - AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None) 
+ AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) diff --git a/tests/test_add_scores_stage_pipe.py b/tests/test_add_scores_stage_pipe.py index 72b5fe59e8..cdfc915bb2 100755 --- a/tests/test_add_scores_stage_pipe.py +++ b/tests/test_add_scores_stage_pipe.py @@ -24,6 +24,7 @@ from _utils.dataset_manager import DatasetManager from _utils.stages.conv_msg import ConvMsg from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage from morpheus.messages import MultiResponseMessage @@ -54,14 +55,25 @@ def test_add_scores_stage_pipe(config: Config, expected_df = dataset_pandas["filter_probs.csv"] expected_df = expected_df.rename(columns=dict(zip(expected_df.columns, config.class_labels))) - pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(ConvMsg(config, order=order, columns=list(input_df.columns))) - pipe.add_stage(AddScoresStage(config)) - pipe.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) - comp_stage = pipe.add_stage(CompareDataFrameStage(config, expected_df)) - pipe.run() + pipe_mm = LinearPipeline(config) + pipe_mm.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + pipe_mm.add_stage(DeserializeStage(config, ensure_sliceable_index=True, message_type=MultiMessage)) + pipe_mm.add_stage(ConvMsg(config, order=order, columns=list(input_df.columns))) + pipe_mm.add_stage(AddScoresStage(config)) + pipe_mm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) + comp_stage = pipe_mm.add_stage(CompareDataFrameStage(config, expected_df)) + pipe_mm.run() + + assert_results(comp_stage.get_results()) + + pipe_cm = LinearPipeline(config) + pipe_cm.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + 
pipe_cm.add_stage(DeserializeStage(config, ensure_sliceable_index=True, message_type=ControlMessage)) + pipe_cm.add_stage(ConvMsg(config, message_type=ControlMessage, order=order, columns=list(input_df.columns))) + pipe_cm.add_stage(AddScoresStage(config)) + pipe_cm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) + comp_stage = pipe_cm.add_stage(CompareDataFrameStage(config, expected_df)) + pipe_cm.run() assert_results(comp_stage.get_results()) @@ -75,18 +87,18 @@ def test_add_scores_stage_multi_segment_pipe(config: Config, dataset_cudf: Datas filter_probs_df = dataset_cudf.pandas["filter_probs.csv"] expected_df = filter_probs_df.rename(columns=dict(zip(filter_probs_df.columns, config.class_labels))) - pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [dataset_cudf["filter_probs.csv"]], repeat=repeat)) - pipe.add_segment_boundary(MessageMeta) - pipe.add_stage(DeserializeStage(config)) - pipe.add_segment_boundary(MultiMessage) - pipe.add_stage(ConvMsg(config, columns=list(filter_probs_df.columns))) - pipe.add_segment_boundary(MultiResponseMessage) - pipe.add_stage(AddScoresStage(config)) - pipe.add_segment_boundary(MultiResponseMessage) - pipe.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) - pipe.add_segment_boundary(MessageMeta) - comp_stage = pipe.add_stage(CompareDataFrameStage(config, expected_df)) - pipe.run() + pipe_mm = LinearPipeline(config) + pipe_mm.set_source(InMemorySourceStage(config, [dataset_cudf["filter_probs.csv"]], repeat=repeat)) + pipe_mm.add_segment_boundary(MessageMeta) + pipe_mm.add_stage(DeserializeStage(config)) + pipe_mm.add_segment_boundary(MultiMessage) + pipe_mm.add_stage(ConvMsg(config, columns=list(filter_probs_df.columns))) + pipe_mm.add_segment_boundary(MultiResponseMessage) + pipe_mm.add_stage(AddScoresStage(config)) + pipe_mm.add_segment_boundary(MultiResponseMessage) + pipe_mm.add_stage(SerializeStage(config, include=[f"^{c}$" for c in 
config.class_labels])) + pipe_mm.add_segment_boundary(MessageMeta) + comp_stage = pipe_mm.add_stage(CompareDataFrameStage(config, expected_df)) + pipe_mm.run() assert_results(comp_stage.get_results()) diff --git a/tests/tests_data/bools.csv b/tests/tests_data/bools.csv new file mode 100644 index 0000000000..04b48f7ffd --- /dev/null +++ b/tests/tests_data/bools.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3b28c6013aa66676adebcafeb433db8debd0af3ecf158a70c34e8cd435d222 +size 26 diff --git a/tests/tests_data/countries_sample.csv b/tests/tests_data/countries_sample.csv new file mode 100644 index 0000000000..8ef8a3c2c7 --- /dev/null +++ b/tests/tests_data/countries_sample.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82b02c9a42bfb7ed3c8ba5abce531bf613b3754c4da4105e525a112505f4c1e +size 50 diff --git a/tests/tests_data/csv_sample.csv b/tests/tests_data/csv_sample.csv new file mode 100644 index 0000000000..9d2aff44af --- /dev/null +++ b/tests/tests_data/csv_sample.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977ef8a2d12b388e2dc6db474d00e0f488f1fe0fc733f88d51668ade50f5e9a5 +size 32 diff --git a/tests/tests_data/float_str.csv b/tests/tests_data/float_str.csv new file mode 100644 index 0000000000..aa71f48920 --- /dev/null +++ b/tests/tests_data/float_str.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cef5fa8f94abdcdd3521841741cfe83f965b9885ee5d667ee2ab634a4fed6cb7 +size 58 diff --git a/tests/tests_data/floats.csv b/tests/tests_data/floats.csv new file mode 100644 index 0000000000..505c7573d4 --- /dev/null +++ b/tests/tests_data/floats.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2976ed36677ac1692bf86c6bca39a145722d7dd2aed087487aec6567e0c2af31 +size 22