diff --git a/man/man1/pmdaamdgpu.1 b/man/man1/pmdaamdgpu.1 new file mode 100644 index 0000000000..0d13ffda23 --- /dev/null +++ b/man/man1/pmdaamdgpu.1 @@ -0,0 +1,156 @@ +'\"macro stdmacro +.\" +.\" Copyright (c) 2024 Red Hat. +.\" +.\" This program is free software; you can redistribute it and/or modify it +.\" under the terms of the GNU General Public License as published by the +.\" Free Software Foundation; either version 2 of the License, or (at your +.\" option) any later version. +.\" +.\" This program is distributed in the hope that it will be useful, but +.\" WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +.\" or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +.\" for more details. +.\" +.TH PMDAAMDGPU 1 "PCP" "Performance Co-Pilot" +.SH NAME +\f3pmdaamdgpu\f1 \- amdgpu gpu metrics domain agent (PMDA) +.SH SYNOPSIS +\f3$PCP_PMDAS_DIR/amdgpu/pmdaamdgpu\f1 +[\f3\-d\f1 \f2domain\f1] +[\f3\-l\f1 \f2logfile\f1] +[\f3\-t\f1 \f2interval\f1] +.SH DESCRIPTION +.B pmdaamdgpu +is a Performance Metrics Domain Agent (PMDA) which extracts +performance metrics describing the metrics available on AMDGPU +GPU cards via the DRM library. +.PP +The +.B amdgpu +PMDA exports metrics that measure gpu activity, memory utilization, +temperature, etc on GCN 1.2+ AMD GPUs. +.PP +A brief description of the +.B pmdaamdgpu +command line options follows: +.TP 5 +.B \-d +It is absolutely crucial that the performance metrics +.I domain +number specified here is unique and consistent. +That is, +.I domain +should be different for every PMDA on the one host, and the same +.I domain +number should be used for the same PMDA on all hosts. +.TP +.B \-l +Location of the log file. By default, a log file named +.I amdgpu.log +is written in the current directory of +.BR pmcd (1) +when +.B pmdaamdgpu +is started, i.e. +.BR $PCP_LOG_DIR/pmcd . +If the log file cannot +be created or is not writable, output is written to the standard error instead. 
+.TP +.B \-t +Enables and sets a sampling +.I interval +for automatic refreshing of metric values. +The functionality is disabled by default, however this option allows +a time interval to be specified on which all values are sampled \- this +has the effect of constantly updating the accumulating metrics, with the +goal of assisting client tools such as +.BR pcp-atop (1) +and +.BR pmlogger (1) +to observe sub-sample time changes in GPU state. +Typically these tools have longer sampling intervals, and can thus miss +activity happening during their sampling interval. +.SH INSTALLATION +The +.B amdgpu +PMDA is not installed by default. +Once it has been installed (as described below), +the installation can be undone by doing the following as root: +.PP +.ft CR +.nf +.in +0.5i +# cd $PCP_PMDAS_DIR/amdgpu +# ./Remove +.in +.fi +.ft 1 +.PP +If you want to establish access to the names, help text and values for the amdgpu +performance metrics, either initially or once more after removal, do the following as root: +.PP +.ft CR +.nf +.in +0.5i +# cd $PCP_PMDAS_DIR/amdgpu +# ./Install +.in +.fi +.ft 1 +.PP +.B pmdaamdgpu +is launched by +.BR pmcd (1) +and should never be executed directly. +The Install and Remove scripts notify +.BR pmcd (1) +when the agent is installed or removed. +.SH FILES +.PD 0 +.TP 10 +.B $PCP_PMCDCONF_PATH +command line options used to launch +.B pmdaamdgpu +.TP 10 +.B $PCP_PMDAS_DIR/amdgpu/help +default help text file for the amdgpu metrics +.TP 10 +.B $PCP_PMDAS_DIR/amdgpu/Install +installation script for the +.B pmdaamdgpu +agent +.TP 10 +.B $PCP_PMDAS_DIR/amdgpu/Remove +undo installation script for the +.B pmdaamdgpu +agent +.TP 10 +.B $PCP_LOG_DIR/pmcd/amdgpu.log +default log file for error messages and other information from +.B pmdaamdgpu +.PD +.SH "PCP ENVIRONMENT" +Environment variables with the prefix +.B PCP_ +are used to configure the file and directory names +used by PCP. +On each installation, the file +.I /etc/pcp.conf +contains the local values for these variables. 
#
# Build and install rules for the amdgpu PMDA (daemon binary and DSO).
#

TOPDIR = ../../..
include $(TOPDIR)/src/include/builddefs

IAM	= amdgpu
DOMAIN	= AMDGPU

# The target names are derived from $(IAM) so that they agree with the
# Install/Remove scripts (iam=amdgpu) and with pmdaamdgpu(1), which
# documents the agent as $PCP_PMDAS_DIR/amdgpu/pmdaamdgpu.
CMDTARGET = pmda$(IAM)$(EXECSUFFIX)
LIBTARGET = pmda_$(IAM).$(DSOSUFFIX)
CFILES	= localdrm.c amdgpu.c
HFILES	= localdrm.h
DFILES	= README
LLDLIBS	= $(PCP_PMDALIB) $(LIB_FOR_DLOPEN) -ldrm -ldrm_amdgpu
# NOTE(review): the DRM include directory is hard-coded here; confirm
# whether it should come from configure/pkg-config instead.
LCFLAGS	+= -DDSOSUFFIX=\"$(DSOSUFFIX)\" -I/usr/include/drm
LDIRT	= domain.h *.log *.dir *.pag so_locations

PMDAADMDIR = $(PCP_PMDASADM_DIR)/$(IAM)
PMDATMPDIR = $(PCP_PMDAS_DIR)/$(IAM)

default: $(LIBTARGET) $(CMDTARGET)

include $(BUILDRULES)

install: default
	$(INSTALL) -m 755 -d $(PMDAADMDIR)
	$(INSTALL) -m 755 -d $(PMDATMPDIR)
	$(INSTALL) -m 755 -t $(PMDATMPDIR) Install Remove $(PMDAADMDIR)
	$(INSTALL) -m 755 -t $(PMDATMPDIR) $(LIBTARGET) $(CMDTARGET) $(PMDAADMDIR)
	$(INSTALL) -m 644 -t $(PMDATMPDIR) $(DFILES) root help pmns domain.h $(PMDAADMDIR)

# every object includes the generated domain number header
$(OBJECTS): domain.h

domain.h: ../../pmns/stdpmid
	$(DOMAIN_MAKERULE)

default_pcp: default

install_pcp: install
# Remove the amdgpu PMDA
#
# Sources the PCP environment and the shared PMDA helper procedures,
# names the agent via $iam and then calls the helper's pmdaSetup and
# pmdaRemove procedures.
#

# source the PCP configuration environment variables
. $PCP_DIR/etc/pcp.env

# Get the common procedures and variable assignments
#
. $PCP_SHARE_DIR/lib/pmdaproc.sh

# The name of the PMDA (same value as used by the Install script)
#
iam=amdgpu

# Do it
#
pmdaSetup
pmdaRemove

exit
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ +#include "pmapi.h" +#include "pmda.h" + +#include "domain.h" +#include "libpcp.h" + +#include "localdrm.h" +#include + +/* InDom table (set of graphics cards, set of processes+devices) */ +enum { GCARD_INDOM = 0 }; +pmdaIndom indomtab[] = { + {GCARD_INDOM, 0, NULL}, +}; + +/* List of metric item numbers - increasing from zero, no holes. + * Double check against `pmns` definition file. + */ +enum { + AMDGPU_NUMCARDS = 0, + AMDGPU_CARDNAME, + AMDGPU_MEMUSED, + AMDGPU_MEMTOTAL, + AMDGPU_MEMFREE, + AMDGPU_SAMPLES, + AMDGPU_MEMUSED_ACCUM, + AMDGPU_GPU_CLOCK, + AMDGPU_GPU_CLOCK_MAX, + AMDGPU_MEMORY_CLOCK, + AMDGPU_MEMORY_CLOCK_MAX, + AMDGPU_TEMPERATURE, + AMDGPU_GPU_LOAD, + AMDGPU_GPU_AVG_PWR, + + AMDGPU_METRIC_COUNT +}; + +/* Table of metrics exported by this PMDA */ +static pmdaMetric metrictab[] = { + {NULL, + {PMDA_PMID(0, AMDGPU_NUMCARDS), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0, 0, 0, 0, 0, 0)}}, + {NULL, + {PMDA_PMID(0, AMDGPU_CARDNAME), PM_TYPE_STRING, GCARD_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0, 0, 0, 0, 0, 0)}}, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMUSED), PM_TYPE_U64, GCARD_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0)}}, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMTOTAL), PM_TYPE_U64, GCARD_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0)}}, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMFREE), PM_TYPE_U64, GCARD_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0)}}, + {NULL, + {PMDA_PMID(0, 
AMDGPU_SAMPLES), PM_TYPE_U64, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0, 0, 0, 0, 0, 0)}}, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMUSED_ACCUM), PM_TYPE_U64, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_GPU_CLOCK), PM_TYPE_U32, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_GPU_CLOCK_MAX), PM_TYPE_U32, GCARD_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMORY_CLOCK), PM_TYPE_U32, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_MEMORY_CLOCK_MAX), PM_TYPE_U32, GCARD_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_TEMPERATURE), PM_TYPE_U32, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_GPU_LOAD), PM_TYPE_U32, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, + {NULL, + {PMDA_PMID(0, AMDGPU_GPU_AVG_PWR), PM_TYPE_U32, GCARD_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1, 0, 0, 0, 0, 0) } }, +}; + +/* GCARD_INDOM struct, stats that are per card */ +typedef struct { + int32_t cardid; + int32_t failed[AMDGPU_METRIC_COUNT]; + char name[64]; + struct amdgpu_gpu_info gpu_info; + uint64_t samples; + uint64_t memaccum; + drmMemory_t memory; + uint32_t gpu_clock; + uint32_t mem_clock; + uint32_t temperature; + uint32_t load; + uint32_t avg_pwr; +} amdgpu_info_t; + +/* overall struct, holds instance values, indom and instance struct arrays */ +typedef struct { + uint32_t numcards; + uint32_t maxcards; + drmDevicePtr *devs; + amdgpu_info_t *info; + pmdaIndom *indom; +} pcp_amdgpuinfo_t; + +static pcp_amdgpuinfo_t pcp_amdgpuinfo; +static char mypath[MAXPATHLEN]; +static int isDSO = 1; +static int drm_initialized; +static int autorefresh = -1; +static struct timeval interval; + +static int setup_gcard_indom(void) { + unsigned int device_count = 0; + pmdaIndom *idp = 
&indomtab[GCARD_INDOM]; + char gpuname[32], *name; + int i, sts; + + /* Initialize instance domain and instances. */ + if ((sts = localDRMDeviceGetDevices(&pcp_amdgpuinfo.devs, + &pcp_amdgpuinfo.maxcards, + &pcp_amdgpuinfo.numcards)) != DRM_SUCCESS) { + pmNotifyErr(LOG_ERR, "DrmDeviceGetDevies: %s", localDRMErrStr(sts)); + return sts; + } + + device_count = pcp_amdgpuinfo.numcards; + pmNotifyErr(LOG_WARNING, "setup_gcard_indom: got %d cards", device_count); + + pcp_amdgpuinfo.indom = idp; + pcp_amdgpuinfo.indom->it_numinst = 0; + pcp_amdgpuinfo.indom->it_set = + (pmdaInstid *)calloc(device_count, sizeof(pmdaInstid)); + + if (!pcp_amdgpuinfo.indom->it_set) { + pmNoMem("gcard indom", device_count * sizeof(pmdaInstid), PM_RECOV_ERR); + free(pcp_amdgpuinfo.devs); + return -ENOMEM; + } + + if ((pcp_amdgpuinfo.info = (amdgpu_info_t *)calloc( + device_count, sizeof(amdgpu_info_t))) == NULL) { + pmNoMem("gcard values", device_count * sizeof(amdgpu_info_t), PM_RECOV_ERR); + free(pcp_amdgpuinfo.devs); + free(pcp_amdgpuinfo.indom->it_set); + return -ENOMEM; + } + + for (i = 0; i < device_count; i++) { + drmDevicePtr dev = pcp_amdgpuinfo.devs[i]; + + pcp_amdgpuinfo.indom->it_set[i].i_inst = i; + pmsprintf(gpuname, sizeof(gpuname), "gpu%d", i); + if ((name = strdup(gpuname)) == NULL) { + pmNoMem("gcard instname", strlen(gpuname), PM_RECOV_ERR); + while (--i) + free(pcp_amdgpuinfo.indom->it_set[i].i_name); + free(pcp_amdgpuinfo.devs); + free(pcp_amdgpuinfo.indom->it_set); + free(pcp_amdgpuinfo.info); + return -ENOMEM; + } + pcp_amdgpuinfo.indom->it_set[i].i_name = name; + + /* Get static values */ + if (localDRMDeviceGetGPUInfo(dev, &pcp_amdgpuinfo.info[i].gpu_info)) { + pcp_amdgpuinfo.info[i].failed[AMDGPU_MEMORY_CLOCK_MAX] = 1; + pcp_amdgpuinfo.info[i].failed[AMDGPU_GPU_CLOCK_MAX] = 1; + } + } + + pcp_amdgpuinfo.indom->it_numinst = device_count; + return 0; +} + +static int refresh(pcp_amdgpuinfo_t *amdgpuinfo) { + drmMemory_t memory = {0}; + int i, j; + + if 
(!drm_initialized) { + setup_gcard_indom(); + drm_initialized = 1; + } + + for (i = 0; i < amdgpuinfo->numcards && i < amdgpuinfo->maxcards; i++) { + amdgpu_info_t *info = &amdgpuinfo->info[i]; + drmDevicePtr dev = amdgpuinfo->devs[i]; + + info->cardid = i; + for (j = 0; j < AMDGPU_METRIC_COUNT; j++) + info->failed[j] = 0; + if (localDRMDeviceGetName(dev, info->name)) + info->failed[AMDGPU_CARDNAME] = 1; + if (localDRMDeviceGetMemoryInfo(dev, &memory)) { + info->failed[AMDGPU_MEMUSED] = 1; + info->failed[AMDGPU_MEMTOTAL] = 1; + info->failed[AMDGPU_MEMFREE] = 1; + } + if (localDRMDeviceGetMemoryClock(dev, &info->mem_clock)) { + info->failed[AMDGPU_MEMORY_CLOCK] = 1; + } + if (localDRMDeviceGetGPUClock(dev, &info->gpu_clock)) { + info->failed[AMDGPU_GPU_CLOCK] = 1; + } + if (localDRMDeviceGetTemperature(dev, &info->temperature)) { + info->failed[AMDGPU_TEMPERATURE] = 1; + } + if (localDRMDeviceGetGPULoad(dev, &info->load)) { + info->failed[AMDGPU_GPU_LOAD] = 1; + } + if (localDRMDeviceGetGPUAveragePower(dev, &info->avg_pwr)) { + info->failed[AMDGPU_GPU_AVG_PWR] = 1; + } + + info->memory = memory; /* struct copy */ + info->memaccum += memory.used; + info->samples++; + } + + return 0; +} + +static int amdgpu_instance(pmInDom indom, int inst, char *name, + pmInResult **result, pmdaExt *pmda) { + return pmdaInstance(indom, inst, name, result, pmda); +} + +/* + * Wrapper for pmdaFetch which refresh the set of values once per fetch + * PDU. The fetchCallback is then called once per-metric/instance pair + * to perform the actual filling of the pmResult (via each pmAtomValue). 
+ */ +static int amdgpu_fetch(int numpmid, pmID pmidlist[], pmResult **resp, + pmdaExt *pmda) +{ + refresh(&pcp_amdgpuinfo); + return pmdaFetch(numpmid, pmidlist, resp, pmda); +} + +static int amdgpu_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, + pmAtomValue *atom) { + unsigned int cluster = pmID_cluster(mdesc->m_desc.pmid); + unsigned int item = pmID_item(mdesc->m_desc.pmid); + + if (item != 0 && cluster == 0 && inst > indomtab[GCARD_INDOM].it_numinst) + return PM_ERR_INST; + + switch (cluster) { + case 0: /* amdgpu general and per-card metrics */ + switch (item) { + case AMDGPU_NUMCARDS: + atom->ul = pcp_amdgpuinfo.numcards; + break; + case AMDGPU_SAMPLES: + atom->ull = pcp_amdgpuinfo.info[inst].samples; + break; + case AMDGPU_CARDNAME: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_CARDNAME]) + return PM_ERR_VALUE; + atom->cp = pcp_amdgpuinfo.info[inst].name; + break; + case AMDGPU_MEMUSED: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMUSED]) + return PM_ERR_VALUE; + atom->ull = pcp_amdgpuinfo.info[inst].memory.used; + break; + case AMDGPU_MEMTOTAL: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMTOTAL]) + return PM_ERR_VALUE; + atom->ull = pcp_amdgpuinfo.info[inst].memory.total; + break; + case AMDGPU_MEMFREE: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMFREE]) + return PM_ERR_VALUE; + atom->ull = + pcp_amdgpuinfo.info[inst].memory.usable - + pcp_amdgpuinfo.info[inst].memory.used; + break; + case AMDGPU_MEMUSED_ACCUM: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMUSED_ACCUM]) + return PM_ERR_VALUE; + atom->ull = pcp_amdgpuinfo.info[inst].memaccum; + break; + case AMDGPU_GPU_CLOCK: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_GPU_CLOCK]) + return PM_ERR_VALUE; + /* The GPU speed is the memory clock (GFX_MCLK) + * The GDDRx memory speed is the shader clock (GFX_SCLK) + * In MHz + */ + atom->ul = pcp_amdgpuinfo.info[inst].gpu_clock; + break; + case AMDGPU_GPU_CLOCK_MAX: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_GPU_CLOCK_MAX]) + return 
PM_ERR_VALUE; + /* The GPU max speed is the max_memory_clk. + * The GDDRx memory max speed is max_engine_clk + * In MHz + */ + atom->ul = pcp_amdgpuinfo.info[inst].gpu_info.max_memory_clk; + break; + case AMDGPU_MEMORY_CLOCK: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMORY_CLOCK]) + return PM_ERR_VALUE; + /* The GPU speed is the memory clock (GFX_MCLK) + * The GDDRx memory speed is the shader clock (GFX_SCLK) + * In MHz + */ + atom->ul = pcp_amdgpuinfo.info[inst].mem_clock; + break; + case AMDGPU_MEMORY_CLOCK_MAX: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_MEMORY_CLOCK_MAX]) + return PM_ERR_VALUE; + /* The GPU max speed is the max_memory_clk. + * The GDDRx memory max speed is max_engine_clk + * In MHz + */ + atom->ul = pcp_amdgpuinfo.info[inst].gpu_info.max_engine_clk; + break; + case AMDGPU_TEMPERATURE: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_TEMPERATURE]) + return PM_ERR_VALUE; + /* In millidegrees Celsius */ + atom->ul = pcp_amdgpuinfo.info[inst].temperature; + break; + case AMDGPU_GPU_LOAD: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_GPU_LOAD]) + return PM_ERR_VALUE; + atom->ul = pcp_amdgpuinfo.info[inst].load; + break; + case AMDGPU_GPU_AVG_PWR: + if (pcp_amdgpuinfo.info[inst].failed[AMDGPU_GPU_AVG_PWR]) + return PM_ERR_VALUE; + atom->ul = pcp_amdgpuinfo.info[inst].avg_pwr; + break; + default: + return PM_ERR_PMID; + } + break; + default: + return PM_ERR_PMID; + } + + return 1; +} + +static int amdgpu_labelCallBack(pmInDom indom, unsigned int inst, + pmLabelSet **lp) { + if (indom == PM_INDOM_NULL) + return 0; + + switch (pmInDom_serial(indom)) { + case GCARD_INDOM: + return pmdaAddLabels(lp, "{\"gpu\":%s}", pcp_amdgpuinfo.info[inst].name); + default: + break; + } + return 0; +} + +static int amdgpu_labelInDom(pmInDom indom, pmLabelSet **lp) { + switch (pmInDom_serial(indom)) { + case GCARD_INDOM: + pmdaAddLabels(lp, "{\"device_type\":\"gpu\"}"); + pmdaAddLabels(lp, "{\"indom_name\":\"per gpu\"}"); + return 1; + default: + break; + } + return 
0; +} + +static int amdgpu_label(int ident, int type, pmLabelSet **lpp, pmdaExt *pmda) { + switch (type) { + case PM_LABEL_INDOM: + amdgpu_labelInDom((pmInDom)ident, lpp); + break; + default: + break; + } + return pmdaLabel(ident, type, lpp, pmda); +} + +/** + * Initializes the path to the help file for this PMDA. + */ +static void initializeHelpPath() { + int sep = pmPathSeparator(); + pmsprintf(mypath, sizeof(mypath), + "%s%c" + "amdgpu" + "%c" + "help", + pmGetConfig("PCP_PMDAS_DIR"), sep, sep); +} + +void __PMDA_INIT_CALL amdgpu_init(pmdaInterface *dp) { + if (isDSO) { + initializeHelpPath(); + pmdaDSO(dp, PMDA_INTERFACE_7, "amdgpu DSO", mypath); + } + + if (dp->status != 0) + return; + + if (!drm_initialized) { + setup_gcard_indom(); + drm_initialized = 1; + } + + dp->version.seven.instance = amdgpu_instance; + dp->version.seven.fetch = amdgpu_fetch; + dp->version.seven.label = amdgpu_label; + pmdaSetFetchCallBack(dp, amdgpu_fetchCallBack); + pmdaSetLabelCallBack(dp, amdgpu_labelCallBack); + + pmdaInit(dp, indomtab, sizeof(indomtab) / sizeof(indomtab[0]), metrictab, + sizeof(metrictab) / sizeof(metrictab[0])); +} + +static void amdgpu_timer(int sig, void *ptr) { + (void)sig; + (void)ptr; + autorefresh = 1; +} + +static void amdgpu_main(pmdaInterface *dispatch) { + fd_set readyfds, fds; + int pmcdfd, maxfd = 0; + + if ((pmcdfd = __pmdaInFd(dispatch)) < 0) + exit(1); + if (pmcdfd > maxfd) + maxfd = pmcdfd; + + FD_ZERO(&fds); + FD_SET(pmcdfd, &fds); + + /* arm interval timer */ + if (autorefresh == 1 && __pmAFregister(&interval, NULL, amdgpu_timer) < 0) { + pmNotifyErr(LOG_ERR, "registering event interval handler"); + exit(1); + } + + for (;;) { + memcpy(&readyfds, &fds, sizeof(readyfds)); + int nready = select(maxfd + 1, &readyfds, NULL, NULL, NULL); + if (pmDebugOptions.appl2) + pmNotifyErr(LOG_DEBUG, "select: nready=%d autorefresh=%d", nready, + autorefresh); + if (nready < 0) { + if (neterror() != EINTR) { + pmNotifyErr(LOG_ERR, "select failure: %s", 
netstrerror()); + exit(1); + } else if (autorefresh == 0) { + continue; + } + } + + __pmAFblock(); + if (nready > 0 && FD_ISSET(pmcdfd, &readyfds)) { + if (pmDebugOptions.appl0) + pmNotifyErr(LOG_DEBUG, "processing pmcd PDU [fd=%d]", pmcdfd); + if (__pmdaMainPDU(dispatch) < 0) { + __pmAFunblock(); + exit(1); /* fatal if we lose pmcd */ + } + if (pmDebugOptions.appl0) + pmNotifyErr(LOG_DEBUG, "completed pmcd PDU [fd=%d]", pmcdfd); + } + if (autorefresh > 0) { + autorefresh = 0; + refresh(&pcp_amdgpuinfo); + } + __pmAFunblock(); + } +} + +static pmLongOptions longopts[] = {PMDA_OPTIONS_HEADER("Options"), + PMOPT_DEBUG, + PMDAOPT_DOMAIN, + PMDAOPT_LOGFILE, + PMOPT_INTERVAL, + PMOPT_HELP, + PMDA_OPTIONS_END}; + +static pmdaOptions opts = { + .short_options = "D:d:l:t:?", + .long_options = longopts, +}; + +int main(int argc, char **argv) { + pmdaInterface desc = {0}; + char *endnum = NULL; + int c; + + isDSO = 0; + pmSetProgname(argv[0]); + + initializeHelpPath(); + pmdaDaemon(&desc, PMDA_INTERFACE_7, pmGetProgname(), AMDGPU, "amdgpu.log", + mypath); + + while ((c = pmdaGetOptions(argc, argv, &opts, &desc)) != EOF) { + switch (c) { + case 't': + if (pmParseInterval(opts.optarg, &interval, &endnum) < 0) { + fprintf(stderr, "%s: -t requires a time interval: %s\n", + pmGetProgname(), endnum); + free(endnum); + opts.errors++; + } + autorefresh = 1; /* enable timers, non-default */ + break; + default: + opts.errors++; + break; + } + } + if (opts.errors) { + pmdaUsageMessage(&opts); + exit(1); + } + + pmdaOpenLog(&desc); + pmdaConnect(&desc); + amdgpu_init(&desc); + amdgpu_main(&desc); + + exit(0); +} diff --git a/src/pmdas/amdgpu/help b/src/pmdas/amdgpu/help new file mode 100644 index 0000000000..ff081d7ba4 --- /dev/null +++ b/src/pmdas/amdgpu/help @@ -0,0 +1,66 @@ +# +# Copyright (c) 2014,2019,2021 Red Hat. +# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# AMD PMDA help file in the ASCII format +# +# lines beginning with a # are ignored +# lines beginning @ introduce a new entry of the form +# @ metric_name oneline-text +# help text goes +# here over multiple lines +# ... +# +# the metric_name is decoded against the default PMNS -- as a special case, +# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an +# instance domain identification, and the text describes the instance domain +# +# blank lines before the @ line are ignored +# + +@ AMDGPU.0 AMD graphics cards installed in this system + +@ amdgpu.numcards Number of Graphics Cards +The number of AMD Graphics cards installed in this system + +@ amdgpu.cardname GPU Name +The name of the graphics card + +@ amdgpu.memused Allocated frame buffer memory +Amount of GPU FB memory that has currently been allocated, in bytes. +Note that the driver/GPU always sets aside a small amount of memory +for bookkeeping. + +@ amdgpu.memtotal Total frame buffer memory available +The total amount of GPU FB memory available on the card, in bytes. + +@ amdgpu.memfree Unallocated frame buffer memory +Amount of GPU FB memory that is not currently allocated, in bytes. 
+ +@ amdgpu.samples Count of value refreshes for each GPU + +@ amdgpu.memusedaccum Cumulative counter of used memory for each GPU + +@ amdgpu.mem_clock The GDDRx memory clock speed in MHz +@ amdgpu.mem_clock_max The maximum GDDRx memory clock speed in MHz + +@ amdgpu.gpu_clock The GPU clock speed in MHz +@ amdgpu.gpu_clock_max The maximum GPU clock speed in MHz + +@ amdgpu.temperature The GPU temperature in millidegrees Celsius +The temperature provided is returned in millidegrees Celsius, but the +GPU may not have this precision. + +@ amdgpu.load The GPU load + +@ amdgpu.avg_pwr The GPU average power consumption diff --git a/src/pmdas/amdgpu/localdrm.c b/src/pmdas/amdgpu/localdrm.c new file mode 100644 index 0000000000..d774c341e5 --- /dev/null +++ b/src/pmdas/amdgpu/localdrm.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2024 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ +#include +#include +#include +#include +#include + +#if defined(HAVE_DLFCN_H) +#include +#endif +#include "localdrm.h" + +#include +#include +#include + +#ifndef DSOSUFFIX +#define DSOSUFFIX "so" +#endif + +/* Looks like AMD kept PCI_VENDOR_ID_ATI for its GPU IDs */ +#define PCI_VENDOR_ID_ATI 0x1002 + +int localDRMShutdown(drmDevicePtr devs[], uint32_t count) { + drmFreeDevices(devs, count); + + free(devs); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetDevices(drmDevicePtr *devs[], + uint32_t *max, + uint32_t *count) { + uint32_t amdgpu_count = 0; + drmDevicePtr *temp = NULL; /* Will contain all devices, including non-amd ones*/ + drmDevicePtr *p = NULL; /* Helper to copy AMD device data */ + + /* First get the total count of devices */ + int dev_count = drmGetDevices(NULL, 0); + + if (dev_count <= 0) { + printf("No devices\n"); + return DRM_ERROR_NOT_FOUND; + } + + /* Allocate space to store device data */ + temp = (drmDevicePtr *)calloc(dev_count, sizeof(drmDevicePtr)); + if (!temp) { + printf("No memory\n"); + return DRM_ERROR_MEMORY; + } + + /* Allocate space for the devices given back to the user */ + p = *devs = (drmDevicePtr *)calloc(dev_count, sizeof(drmDevicePtr)); + if (!*devs) { + printf("No memory\n"); + free(temp); + return DRM_ERROR_MEMORY; + } + + dev_count = drmGetDevices(temp, dev_count); + + if (dev_count <= 0) { + printf("Failed to retrieve devices\n"); + free(temp); + free(*devs); + *devs = NULL; + return DRM_ERROR_NOT_FOUND; + } + + /* Walk through the devices, and keep the AMD GPU ones */ + for (uint32_t i = 0; i < dev_count; i++) { + if (temp[i]->bustype != DRM_BUS_PCI || + temp[i]->deviceinfo.pci->vendor_id != PCI_VENDOR_ID_ATI) + continue; + + int fd = -1; + + // Try render node first + if (1 << DRM_NODE_RENDER & temp[i]->available_nodes) { + fd = open(temp[i]->nodes[DRM_NODE_RENDER], O_RDWR); + } + + if (fd < 0) { + // Fallback to primary node + if (1 << DRM_NODE_PRIMARY & temp[i]->available_nodes) { + fd = 
open(temp[i]->nodes[DRM_NODE_PRIMARY], O_RDWR); + } + } + + if (fd < 0) + continue; + + /* Check the version, as it contains the driver name */ + drmVersionPtr ver = drmGetVersion(fd); + + if (!ver) { + close(fd); + continue; + } + + if (strcmp(ver->name, "amdgpu")) { + drmFreeVersion(ver); + continue; + } + + /* Copy the AMD GPU data */ + memcpy(&p[amdgpu_count++], &temp[i], sizeof(drmDevicePtr)); + + /* Done with version */ + drmFreeVersion(ver); + } + + *max = dev_count; + *count = amdgpu_count; + + /* Done with all devices (we copied the ones needed */ + free(temp); + + return DRM_SUCCESS; +} + +static int getAMDDevice(drmDevicePtr dev, amdgpu_device_handle *amd_dev, int *fd) { + uint32_t drm_major, drm_minor; + + // Try render node first + if (1 << DRM_NODE_RENDER & dev->available_nodes) { + *fd = open(dev->nodes[DRM_NODE_RENDER], O_RDWR); + } + if (*fd < 0) { + // Fallback to primary node + if (1 << DRM_NODE_PRIMARY & dev->available_nodes) { + *fd = open(dev->nodes[DRM_NODE_PRIMARY], O_RDWR); + } + } + + if (*fd < 0) + return DRM_ERROR_INSUFFICIENT_RESOURCES; + + /* Initialize AMD GPU */ + amdgpu_device_initialize(*fd, &drm_major, &drm_minor, amd_dev); + + return DRM_SUCCESS; +} + +static void releaseAMDDevice(amdgpu_device_handle amd_dev, int fd) { + amdgpu_device_deinitialize(amd_dev); + + close(fd); +} + +int localDRMDeviceGetName(drmDevicePtr device, char name[64]) { + amdgpu_device_handle amdgpu_device; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + strncpy(name, amdgpu_get_marketing_name(amdgpu_device), 63); + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetGPUInfo(drmDevicePtr device, struct amdgpu_gpu_info *info) { + amdgpu_device_handle amdgpu_device; + + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + if 
(amdgpu_query_gpu_info(amdgpu_device, info) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetMemoryClock(drmDevicePtr device, uint32_t *value) { + amdgpu_device_handle amdgpu_device; + + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + /* The GPU speed is the memory clock (GFX_MCLK) + * The GDDRx memory speed is the shader clock (GFX_SCLK) + */ if (amdgpu_query_sensor_info(amdgpu_device, AMDGPU_INFO_SENSOR_GFX_SCLK, sizeof(*value), value) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetGPUClock(drmDevicePtr device, uint32_t *value) { + amdgpu_device_handle amdgpu_device; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + /* The GPU speed is the memory clock (GFX_MCLK) + * The GDDRx memory speed is the shader clock (GFX_SCLK) + */ + if (amdgpu_query_sensor_info(amdgpu_device, AMDGPU_INFO_SENSOR_GFX_MCLK, sizeof(*value), value) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetTemperature(drmDevicePtr device, uint32_t *value) { + amdgpu_device_handle amdgpu_device; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + if (amdgpu_query_sensor_info(amdgpu_device, AMDGPU_INFO_SENSOR_GPU_TEMP, sizeof(*value), value) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetGPULoad(drmDevicePtr device, uint32_t *value) { + amdgpu_device_handle amdgpu_device; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + if 
(amdgpu_query_sensor_info(amdgpu_device, AMDGPU_INFO_SENSOR_GPU_LOAD, sizeof(*value), value) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetGPUAveragePower(drmDevicePtr device, uint32_t *value) { + amdgpu_device_handle amdgpu_device; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + if (amdgpu_query_sensor_info(amdgpu_device, AMDGPU_INFO_SENSOR_GPU_AVG_POWER, sizeof(*value), value) < 0) + return DRM_ERROR_NO_DATA; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +int localDRMDeviceGetMemoryInfo(drmDevicePtr device, drmMemory_t *memory) { + amdgpu_device_handle amdgpu_device; + struct drm_amdgpu_memory_info mem; + int fd = -1; + int status = DRM_SUCCESS; + + if ((status = getAMDDevice(device, &amdgpu_device, &fd)) != DRM_SUCCESS) + return status; + + if (amdgpu_query_info(amdgpu_device, AMDGPU_INFO_MEMORY, sizeof(mem), &mem) < 0) + return DRM_ERROR_NO_DATA; + + memory->total = mem.vram.total_heap_size; + memory->usable = mem.vram.usable_heap_size; + memory->used = mem.vram.heap_usage; + + releaseAMDDevice(amdgpu_device, fd); + + return DRM_SUCCESS; +} + +const char *localDRMErrStr(drmReturn_t sts) { + int i; + static const char *unknown = "No such error code"; + static struct { + int code; + const char *msg; + } table[] = { + {DRM_SUCCESS, "The operation was successful"}, + {DRM_ERROR_NOT_FOUND, "A query to find an object was unsuccessful"}, + {DRM_ERROR_MEMORY, "Not enough memory available"}, + {DRM_ERROR_NO_DATA, "No data available for this request"}, + {DRM_ERROR_INSUFFICIENT_RESOURCES, "Unable to open file, not enough resources"}, + {DRM_ERROR_UNKNOWN, "An internal driver error occurred"}}; + + for (i = 0; i < (sizeof(table) / sizeof(table[0])); i++) { + if (table[i].code == sts) + return table[i].msg; + } + return unknown; +} diff --git a/src/pmdas/amdgpu/localdrm.h 
b/src/pmdas/amdgpu/localdrm.h
new file mode 100644
index 0000000000..02653b1f76
--- /dev/null
+++ b/src/pmdas/amdgpu/localdrm.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#ifndef _LOCAL_DRM_H
+#define _LOCAL_DRM_H
+
+/*
+ * NOTE(review): the header names of the three #include lines below are
+ * missing in this copy of the patch; the declarations that follow need
+ * drmDevicePtr, struct amdgpu_gpu_info and the fixed-width integer types.
+ * Restore the names from the upstream file.
+ */
+#include
+#include
+#include
+
+/*
+ * Status codes used by the localDRM* functions, which return them as int.
+ * localDRMErrStr() maps each code to a human-readable message.
+ */
+typedef enum {
+    DRM_SUCCESS = 0,                    /* the operation was successful */
+    DRM_ERROR_NOT_FOUND,                /* a lookup found nothing */
+    DRM_ERROR_MEMORY,                   /* not enough memory available */
+    DRM_ERROR_NO_DATA,                  /* a libdrm query for the requested value failed */
+    DRM_ERROR_INSUFFICIENT_RESOURCES,   /* no device node could be opened */
+    DRM_ERROR_UNKNOWN = 99              /* internal driver error */
+} drmReturn_t;
+
+/*
+ * VRAM figures filled in by localDRMDeviceGetMemoryInfo() from the vram
+ * member of struct drm_amdgpu_memory_info (total_heap_size,
+ * usable_heap_size and heap_usage respectively).
+ * Presumably all three are byte counts -- TODO confirm.
+ */
+typedef struct {
+    uint64_t total;
+    uint64_t usable;
+    uint64_t used;
+} drmMemory_t;
+
+/* Shutdown, and translation of a status code into a static message */
+extern int localDRMShutdown(drmDevicePtr [], uint32_t);
+extern const char *localDRMErrStr(drmReturn_t);
+
+/*
+ * Device enumeration followed by the per-device getters.  All of them
+ * return DRM_SUCCESS (0) or another drmReturn_t value.
+ * localDRMDeviceGetName() expects a buffer of 64 bytes.
+ */
+extern int localDRMDeviceGetDevices(drmDevicePtr *[], uint32_t *, uint32_t *);
+extern int localDRMDeviceGetName(drmDevicePtr, char [64]);
+extern int localDRMDeviceGetGPUInfo(drmDevicePtr, struct amdgpu_gpu_info *);
+extern int localDRMDeviceGetMemoryClock(drmDevicePtr, uint32_t *);
+extern int localDRMDeviceGetGPUClock(drmDevicePtr, uint32_t *);
+extern int localDRMDeviceGetTemperature(drmDevicePtr, uint32_t *);
+extern int localDRMDeviceGetGPULoad(drmDevicePtr, uint32_t *);
+extern int localDRMDeviceGetGPUAveragePower(drmDevicePtr, uint32_t *);
+extern int localDRMDeviceGetMemoryInfo(drmDevicePtr, drmMemory_t *);
+#endif /* _LOCAL_DRM_H */
diff --git a/src/pmdas/amdgpu/pmns b/src/pmdas/amdgpu/pmns
new file mode 100644
index 0000000000..9d61709a74
--- /dev/null
+++ b/src/pmdas/amdgpu/pmns
@@ -0,0 +1,33 @@
+/*
+ * Metrics for amd GPU PMDA
+ *
+ * Copyright (c) 2024 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+amdgpu {
+    numcards        AMDGPU:0:0
+    cardname        AMDGPU:0:1
+    memused         AMDGPU:0:2
+    memtotal        AMDGPU:0:3
+    memfree         AMDGPU:0:4
+    samples         AMDGPU:0:5
+    memusedaccum    AMDGPU:0:6
+    mem_clock       AMDGPU:0:7
+    mem_clock_max   AMDGPU:0:8
+    gpu_clock       AMDGPU:0:9
+    gpu_clock_max   AMDGPU:0:10
+    temperature     AMDGPU:0:11
+    load            AMDGPU:0:12
+    avg_pwr         AMDGPU:0:13
+}
+
diff --git a/src/pmdas/amdgpu/root b/src/pmdas/amdgpu/root
new file mode 100644
index 0000000000..9ff9ab51dd
--- /dev/null
+++ b/src/pmdas/amdgpu/root
@@ -0,0 +1,10 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+/*
+ * NOTE(review): the include target is missing in this copy of the patch;
+ * presumably <stdpmid>, which would supply the AMDGPU domain number used
+ * by the pmns entries -- TODO confirm.
+ */
+#include
+
+root { amdgpu }
+
+#include "pmns"
+
diff --git a/src/pmns/stdpmid.pcp b/src/pmns/stdpmid.pcp
index e186791dbe..41b2dfe51d 100644
--- a/src/pmns/stdpmid.pcp
+++ b/src/pmns/stdpmid.pcp
@@ -141,6 +141,7 @@ OVERHEAD 158
 RESCTRL 159
 FARM 160 /* Seagate vendor specific Field Accessible Reliability Metrics log */
 UWSGI 161
+AMDGPU 162
 ### FREE SLOTS ###
 SCHIZO 241
 SLOW_PYTHON 242