Skip to content

Commit 6fd86d4

Browse files
Change usage of NVML to DCGM (triton-inference-server#2924)
Builds on https://github.com/triton-inference-server/server/compare/hemantj-dcgm to switch GPU monitoring from NVML to DCGM. Co-authored-by: CoderHam <[email protected]>
1 parent 1941595 commit 6fd86d4

File tree

5 files changed

+363
-211
lines changed

5 files changed

+363
-211
lines changed

build.py

+31-12
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
55
# modification, are permitted provided that the following conditions
@@ -477,10 +477,21 @@ def dali_cmake_args():
477477
]
478478

479479

480+
def install_dcgm_libraries():
    """Return the Dockerfile fragment that installs NVIDIA DCGM.

    The fragment installs software-properties-common (which provides
    add-apt-repository), pins and registers NVIDIA's CUDA apt repository
    for Ubuntu 20.04 / x86_64, and then installs the
    datacenter-gpu-manager package from that repository.

    The repository signing key fetched here is 3bf863cc.pub. The key
    used previously, 7fa2af80.pub, was rotated out by NVIDIA, so
    'apt-get update' against this repository fails signature
    verification when only the old key is trusted.

    Returns:
        str: Dockerfile text, starting and ending with a newline, ready
        to be appended to a Dockerfile that is being assembled.
    """
    # This is not a raw string, so Python consumes each backslash-newline
    # pair and every multi-line RUN below reaches the Dockerfile as a
    # single line.
    return '''
# Install DCGM
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \
    && mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
RUN apt-get update \
    && apt-get install -y datacenter-gpu-manager
'''
491+
492+
480493
def fil_cmake_args(images):
481-
cargs = [
482-
'-DTRITON_FIL_DOCKER_BUILD=ON'
483-
]
494+
cargs = ['-DTRITON_FIL_DOCKER_BUILD=ON']
484495
if 'base' in images:
485496
cargs.append('-DTRITON_BUILD_CONTAINER={}'.format(images['base']))
486497
else:
@@ -572,12 +583,17 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap, backends):
572583
RUN rm -fr *
573584
COPY . .
574585
ENTRYPOINT []
586+
'''
587+
df += install_dcgm_libraries()
588+
df += '''
589+
RUN patch -ruN -d /usr/include/ < /workspace/build/libdcgm/dcgm_api_export.patch
575590
'''
576591

577592
df += '''
578593
ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
579594
ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
580595
'''
596+
581597
mkdir(ddir)
582598
with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
583599
dfile.write(df)
@@ -668,6 +684,7 @@ def create_dockerfile_linux(ddir, dockerfile_name, argmap, backends, repoagents,
668684
libre2-5 && \
669685
rm -rf /var/lib/apt/lists/*
670686
'''
687+
df += install_dcgm_libraries()
671688
# Add dependencies needed for python backend
672689
if 'python' in backends:
673690
df += '''
@@ -736,7 +753,8 @@ def create_dockerfile_linux(ddir, dockerfile_name, argmap, backends, repoagents,
736753
dfile.write(df)
737754

738755

739-
def create_dockerfile_windows(ddir, dockerfile_name, argmap, backends, repoagents):
756+
def create_dockerfile_windows(ddir, dockerfile_name, argmap, backends,
757+
repoagents):
740758
df = '''
741759
#
742760
# Multistage build.
@@ -976,15 +994,16 @@ def container_build(images, backends, repoagents, endpoints):
976994
# the install artifacts from the tritonserver_build
977995
# container.
978996
if target_platform() == 'windows':
979-
create_dockerfile_windows(FLAGS.build_dir, 'Dockerfile', dockerfileargmap,
980-
backends, repoagents)
997+
create_dockerfile_windows(FLAGS.build_dir, 'Dockerfile',
998+
dockerfileargmap, backends, repoagents)
981999
else:
982-
create_dockerfile_linux(FLAGS.build_dir, 'Dockerfile', dockerfileargmap,
983-
backends, repoagents, endpoints)
1000+
create_dockerfile_linux(FLAGS.build_dir, 'Dockerfile',
1001+
dockerfileargmap, backends, repoagents,
1002+
endpoints)
9841003
p = subprocess.Popen([
985-
'docker', 'build', '-f',
986-
os.path.join(FLAGS.build_dir, 'Dockerfile')
987-
] + ['-t', 'tritonserver', '.'])
1004+
'docker', 'build', '-f',
1005+
os.path.join(FLAGS.build_dir, 'Dockerfile')
1006+
] + ['-t', 'tritonserver', '.'])
9881007
p.wait()
9891008
fail_if(p.returncode != 0, 'docker build tritonserver failed')
9901009

build/libdcgm/dcgm_api_export.patch

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
--- /dev/null 2021-05-28 06:28:50.736023234 +0000
2+
+++ /usr/include/dcgm_api_export.h 2021-05-28 18:29:49.516676962 +0000
3+
@@ -0,0 +1,14 @@
4+
+#ifndef DCGM_DCGM_API_EXPORT_H
5+
+#define DCGM_DCGM_API_EXPORT_H
6+
+#undef DCGM_PUBLIC_API
7+
+#undef DCGM_PRIVATE_API
8+
+#if defined(DCGM_API_EXPORT)
9+
+#define DCGM_PUBLIC_API __attribute((visibility("default")))
10+
+#else
11+
+#define DCGM_PUBLIC_API
12+
+#if defined(ERROR_IF_NOT_PUBLIC)
13+
+#error(Should be public)
14+
+#endif
15+
+#endif
16+
+#define DCGM_PRIVATE_API __attribute((visibility("hidden")))
17+
+#endif // DCGM_DCGM_API_EXPORT_H

0 commit comments

Comments
 (0)