proc: add amdgpu support
Add per process GPU metrics for AMD GPUs.
The data is retrieved from `/proc/<pid>/fdinfo`.

The current implementation accumulates data for each process, walking
through all file descriptors and looking for drm and amd entries.

A future patch may separate the data per DRM client ID.
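
For reference, a DRM file descriptor of an amdgpu client exposes
key/value pairs along these lines (illustrative values, trimmed;
exact keys vary with the kernel version):

    pos:    0
    flags:  02100002
    drm-driver:     amdgpu
    drm-client-id:  42
    drm-memory-vram:        524288 KiB
    drm-memory-gtt:         131072 KiB
    drm-memory-cpu:         4096 KiB
    drm-shared-vram:        0 KiB
    amd-memory-visible-vram:        262144 KiB
    amd-evicted-vram:       0 KiB
    amd-evicted-visible-vram:       0 KiB
    amd-requested-vram:     524288 KiB
    amd-requested-visible-vram:     262144 KiB
    amd-requested-gtt:      131072 KiB

All memory values are reported in KiB, which is why the new metrics
use PM_SPACE_KBYTE units throughout.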

Signed-off-by: Frédéric Bérat <[email protected]>
fberat committed Jul 11, 2024
1 parent 19a9bc3 commit 136a5ca
Showing 10 changed files with 383 additions and 1 deletion.
3 changes: 3 additions & 0 deletions qa/1222
@@ -58,6 +58,8 @@ _filter()
-e '/^proc\.id\.container: Missing metric value(s)/d' \
-e '/^proc\.psinfo\.labels: No value(s) available/d' \
-e '/^proc\.psinfo\.ngid: Metric not supported by this version/d' \
-e '/^proc\.fdinfo\..*: Metric not supported/d' \
-e '/^proc\.fdinfo\..*: No value(s) available/d' \
-e '/pmdaFetch: Fetch callback error from metric PMID 3\.11\.0\[.*]: No data available/d' \
-e '/proc\.psinfo\.tty_pgrp: No value(s) available/d' \
-e '/ acct: existing pacct file did not grow /d' \
@@ -105,6 +107,7 @@ NF == 0 && seen == 1 { if (numval == 1) print metric ": 1 value"
-e '/^proc\.psinfo\.ngid:/d' \
-e '/^proc\.psinfo\.tty_pgrp:/d' \
-e '/^proc\.smaps\./d' \
-e '/^proc\.fdinfo\./d' \
-e '/^Command: /s/,proc_init .*/,proc_init ... metrics .../' \
-e '/ERROR SUMMARY/q' \
# end
2 changes: 2 additions & 0 deletions qa/364
@@ -288,6 +288,7 @@ _filter_linux()
{
# pcp-atop uses metrics not supported on some kernels
# proc.smaps.* metrics are not present for older kernels
# proc.fdinfo.* metrics are not present either
#
if [ $PCP_PLATFORM != linux ]
then
@@ -310,6 +311,7 @@ _filter_linux()
-e '/^proc\.psinfo\.cgroups -12351 Missing metric value(s)/d' \
-e '/^proc\.namespaces\.envid -12350 Metric not supported/d' \
-e '/^hotproc\.namespaces\.envid -12350 Metric not supported/d' \
-e '/^proc\.fdinfo\..* Metric not supported/d' \
# linux
fi
}
4 changes: 3 additions & 1 deletion src/pmdas/linux_proc/clusters.h
@@ -70,8 +70,10 @@
#define CLUSTER_PID_AUTOGROUP 74 /* /proc/<pid>/autogroup */
#define CLUSTER_HOTPROC_PID_AUTOGROUP 75 /* /proc/<pid>/autogroup */
#define CLUSTER_CGROUP2_IRQ_PRESSURE 76
#define CLUSTER_PID_FDINFO 77 /* /proc/<pid>/fdinfo */
#define CLUSTER_HOTPROC_PID_FDINFO 78 /* /proc/<pid>/fdinfo */

#define MIN_CLUSTER 8 /* first cluster number we use here */
#define MAX_CLUSTER 77 /* one more than highest cluster number used */
#define MAX_CLUSTER 79 /* one more than highest cluster number used */

#endif /* _CLUSTERS_H */
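
Each PMID packs one of these cluster numbers together with a
per-metric item number, giving the fdinfo metrics their own
identifier space. A hypothetical lookup, mirroring the metric table
added to pmda.c below:

    /* PMID for proc.fdinfo.drm_memory_vram: cluster 77, item 2 */
    pmID pmid = PMDA_PMID(CLUSTER_PID_FDINFO, 2);

MAX_CLUSTER must remain one more than the highest cluster number in
use, hence the bump from 77 to 79 once clusters 77 and 78 are taken.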
1 change: 1 addition & 0 deletions src/pmdas/linux_proc/help
@@ -591,3 +591,4 @@ of accounting information:
0 inactive (no information available)
1 system (system level accounting from whatever file accton(8) is using)
2 private (accounting records from $PCP_VAR_DIR/pmcd/pacct)

14 changes: 14 additions & 0 deletions src/pmdas/linux_proc/help_text.h
@@ -155,4 +155,18 @@ help_text_t help_text[] = {
{ .name = "autogroup.enabled", .shorthelp = "Scheduling autogroup feature for CFS is enabled in the kernel", .longhelp = "Contents of /proc/sys/kernel/sched_autogroup_enabled as described in sched(7)." },
{ .name = "autogroup.id", .shorthelp = "Process autogroup identifier from /proc/<pid>/autogroup", .longhelp = "Process scheduling autogroup identifier as described in sched(7)." },
{ .name = "autogroup.nice", .shorthelp = "Process autogroup nice level from /proc/<pid>/autogroup", .longhelp = "Process scheduling autogroup nice level as described in sched(7)." },

{ .name = "fdinfo.drm_memory_cpu", .shorthelp = "Accumulation of the drm-memory-cpu field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "CPU memory which can be used by the GPU to store buffer objects." },
{ .name = "fdinfo.drm_memory_gtt", .shorthelp = "Accumulation of the drm-memory-gtt field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "GTT memory which can be used by the GPU to store buffer objects." },
{ .name = "fdinfo.drm_memory_vram", .shorthelp = "Accumulation of the drm-memory-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "VRAM memory which can be used by the GPU to store buffer objects." },
{ .name = "fdinfo.drm_shared_cpu", .shorthelp = "Accumulation of the drm-shared-cpu field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "CPU memory which can be used by the GPU to store buffer objects, and is shared with another file." },
{ .name = "fdinfo.drm_shared_gtt", .shorthelp = "Accumulation of the drm-shared-gtt field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "GTT memory which can be used by the GPU to store buffer objects, and is shared with another file." },
{ .name = "fdinfo.drm_shared_vram", .shorthelp = "Accumulation of the drm-shared-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "VRAM memory which can be used by the GPU to store buffer objects, and is shared with another file." },

{ .name = "fdinfo.amd_evicted_visible_vram", .shorthelp = "Accumulation of the amd-evicted-visible-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "Sum of evicted buffers due to CPU access." },
{ .name = "fdinfo.amd_evicted_vram", .shorthelp = "Accumulation of the amd-evicted-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "Sum of evicted buffers, includes visible VRAM" },
{ .name = "fdinfo.amd_memory_visible_vram", .shorthelp = "Accumulation of the amd-memory-visible-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "Current visible VRAM usage" },
{ .name = "fdinfo.amd_requested_gtt", .shorthelp = "Accumulation of the amd-requested-gtt field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "How much GTT memory userspace asked for" },
{ .name = "fdinfo.amd_requested_visible_vram", .shorthelp = "Accumulation of the amd-requested-visible-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "How much visible VRAM userspace asked for" },
{ .name = "fdinfo.amd_requested_vram", .shorthelp = "Accumulation of the amd-requested-vram field from /proc/<pid>/fdinfo/* file descriptors", .longhelp = "How much VRAM userspace asked for, includes visible VRAM" },
};
104 changes: 104 additions & 0 deletions src/pmdas/linux_proc/pmda.c
@@ -33,6 +33,7 @@
#include "../linux/convert.h"

#include <ctype.h>
#include <sys/syslog.h>
#include <unistd.h>
#include <sys/vfs.h>
#include <sys/stat.h>
@@ -1348,6 +1349,52 @@ static pmdaMetric metrictab[] = {
/* acct.control.state */
{ NULL, { PMDA_PMID(CLUSTER_ACCT,CONTROL_ACCT_STATE), PM_TYPE_32, PM_INDOM_NULL,
PM_SEM_DISCRETE, PMDA_PMUNITS(0,0,0,0,0,0) }, },

/*
* Fdinfo cluster
*/

/* proc.fdinfo.drm_memory_cpu */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,0), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.drm_memory_gtt */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,1), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.drm_memory_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,2), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.drm_shared_cpu */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,3), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.drm_shared_gtt */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,4), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.drm_shared_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,5), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},

/* proc.fdinfo.amd_evicted_visible_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,6), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.amd_evicted_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,7), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.amd_memory_visible_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,8), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.amd_requested_gtt */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,9), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.amd_requested_visible_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,10), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},
/* proc.fdinfo.amd_requested_vram */
{ NULL, { PMDA_PMID(CLUSTER_PID_FDINFO,11), PM_TYPE_U64, PROC_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}},

};

pmInDom
@@ -1444,6 +1491,7 @@ proc_refresh(pmdaExt *pmda, int *need_refresh)
need_refresh[CLUSTER_PID_CWD] ||
need_refresh[CLUSTER_PID_EXE] ||
need_refresh[CLUSTER_PID_FD] ||
need_refresh[CLUSTER_PID_FDINFO] ||
need_refresh[CLUSTER_PROC_RUNQ]) {
refresh_proc_pid(&proc_pid,
need_refresh[CLUSTER_PROC_RUNQ]? &proc_runq : NULL,
@@ -1464,6 +1512,7 @@ proc_refresh(pmdaExt *pmda, int *need_refresh)
need_refresh[CLUSTER_HOTPROC_PID_CWD] ||
need_refresh[CLUSTER_HOTPROC_PID_EXE] ||
need_refresh[CLUSTER_HOTPROC_PID_FD] ||
need_refresh[CLUSTER_HOTPROC_PID_FDINFO] ||
need_refresh[CLUSTER_HOTPROC_GLOBAL] ||
need_refresh[CLUSTER_HOTPROC_PRED]){
refresh_hotproc_pid(&hotproc_pid,
@@ -1495,6 +1544,7 @@ proc_instance(pmInDom indom, int inst, char *name, pmInResult **result, pmdaExt
need_refresh[CLUSTER_PID_CWD]++;
need_refresh[CLUSTER_PID_IO]++;
need_refresh[CLUSTER_PID_FD]++;
need_refresh[CLUSTER_PID_FDINFO]++;
break;
case HOTPROC_INDOM:
need_refresh[CLUSTER_HOTPROC_PID_STAT]++;
@@ -1511,6 +1561,7 @@ proc_instance(pmInDom indom, int inst, char *name, pmInResult **result, pmdaExt
need_refresh[CLUSTER_HOTPROC_PID_FD]++;
need_refresh[CLUSTER_HOTPROC_GLOBAL]++;
need_refresh[CLUSTER_HOTPROC_PRED]++;
need_refresh[CLUSTER_HOTPROC_PID_FDINFO]++;
break;

case CGROUP_CPUSET_INDOM:
@@ -3430,6 +3481,59 @@ proc_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
}
break;

case CLUSTER_HOTPROC_PID_FDINFO:
active_proc_pid = &hotproc_pid;
/*FALLTHROUGH*/
case CLUSTER_PID_FDINFO:
if (!have_access)
return PM_ERR_PERMISSION;
if ((entry = fetch_proc_pid_fdinfo(inst, active_proc_pid, &sts)) == NULL)
return sts;
if (!(entry->success & PROC_PID_FLAG_FDINFO))
return 0;

switch (item) {
case 0: /* proc.fdinfo.drm_memory_cpu */
atom->ull = entry->fdinfo.drm_memory_cpu;
break;
case 1: /* proc.fdinfo.drm_memory_gtt */
atom->ull = entry->fdinfo.drm_memory_gtt;
break;
case 2: /* proc.fdinfo.drm_memory_vram */
atom->ull = entry->fdinfo.drm_memory_vram;
break;
case 3: /* proc.fdinfo.drm_shared_cpu */
atom->ull = entry->fdinfo.drm_shared_cpu;
break;
case 4: /* proc.fdinfo.drm_shared_gtt */
atom->ull = entry->fdinfo.drm_shared_gtt;
break;
case 5: /* proc.fdinfo.drm_shared_vram */
atom->ull = entry->fdinfo.drm_shared_vram;
break;

case 6: /* proc.fdinfo.amd_evicted_visible_vram */
atom->ull = entry->fdinfo.amd_evicted_visible_vram;
break;
case 7: /* proc.fdinfo.amd_evicted_vram */
atom->ull = entry->fdinfo.amd_evicted_vram;
break;
case 8: /* proc.fdinfo.amd_memory_visible_vram */
atom->ull = entry->fdinfo.amd_memory_visible_vram;
break;
case 9: /* proc.fdinfo.amd_requested_gtt */
atom->ull = entry->fdinfo.amd_requested_gtt;
break;
case 10: /* proc.fdinfo.amd_requested_visible_vram */
atom->ull = entry->fdinfo.amd_requested_visible_vram;
break;
case 11: /* proc.fdinfo.amd_requested_vram */
atom->ull = entry->fdinfo.amd_requested_vram;
break;
default: /* unknown item */
return PM_ERR_PMID;
}
break;
default: /* unknown cluster */
return PM_ERR_PMID;
}
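
The fetch code above assumes fetch_proc_pid_fdinfo() has already
walked the process's descriptors and summed the per-key values. That
parser lives in one of the diffs not rendered here, so the following
is only a rough sketch of the accumulation the commit message
describes, with hypothetical struct and helper names throughout:

    #include <dirent.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct {
        uint64_t drm_memory_cpu;
        uint64_t drm_memory_vram;
        uint64_t amd_requested_vram;
        /* ... one field per proc.fdinfo item ... */
    } fdinfo_t;

    /* Fold one "key: value KiB" line into the per-process totals */
    static void
    accumulate_field(fdinfo_t *f, const char *line)
    {
        unsigned long long v;

        if (sscanf(line, "drm-memory-cpu: %llu", &v) == 1)
            f->drm_memory_cpu += v;
        else if (sscanf(line, "drm-memory-vram: %llu", &v) == 1)
            f->drm_memory_vram += v;
        else if (sscanf(line, "amd-requested-vram: %llu", &v) == 1)
            f->amd_requested_vram += v;
        /* ... and so on for the remaining drm-/amd- keys ... */
    }

    /* Walk /proc/<pid>/fdinfo, accumulating across all open fds */
    static void
    accumulate_fdinfo(int pid, fdinfo_t *f)
    {
        char path[300], line[256];
        struct dirent *dp;
        DIR *dir;
        FILE *fp;

        memset(f, 0, sizeof(*f));
        snprintf(path, sizeof(path), "/proc/%d/fdinfo", pid);
        if ((dir = opendir(path)) == NULL)
            return;
        while ((dp = readdir(dir)) != NULL) {
            if (dp->d_name[0] == '.')
                continue;       /* skip "." and ".." */
            snprintf(path, sizeof(path), "/proc/%d/fdinfo/%s",
                     pid, dp->d_name);
            if ((fp = fopen(path, "r")) == NULL)
                continue;       /* fd raced closed; not an error */
            while (fgets(line, sizeof(line), fp) != NULL)
                accumulate_field(f, line);
            fclose(fp);
        }
        closedir(dir);
    }

Non-DRM descriptors simply match none of the keys, so only drm and
amd entries contribute to the totals.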
18 changes: 18 additions & 0 deletions src/pmdas/linux_proc/proc_dynamic.c
@@ -40,6 +40,7 @@ enum {
DYNPROC_GROUP_NAMESPACE,
DYNPROC_GROUP_SMAPS,
DYNPROC_GROUP_AUTOGROUP,
DYNPROC_GROUP_FDINFO,

NUM_DYNPROC_GROUPS
};
@@ -69,6 +70,7 @@ static int proc_hotproc_cluster_list[][2] = {
{ CLUSTER_PID_EXE, CLUSTER_HOTPROC_PID_EXE },
{ CLUSTER_PID_CWD, CLUSTER_HOTPROC_PID_CWD },
{ CLUSTER_PID_AUTOGROUP, CLUSTER_HOTPROC_PID_AUTOGROUP },
{ CLUSTER_PID_FDINFO, CLUSTER_HOTPROC_PID_FDINFO },
};


@@ -257,6 +259,21 @@ static dynproc_metric_t smaps_metrics[] = {
{ .name = "pss_dirty", .cluster = CLUSTER_PID_SMAPS, .item=20 },
};

static dynproc_metric_t fdinfo_metrics[] = {
{ .name = "drm_memory_cpu", .cluster = CLUSTER_PID_FDINFO, .item=0 },
{ .name = "drm_memory_gtt", .cluster = CLUSTER_PID_FDINFO, .item=1 },
{ .name = "drm_memory_vram", .cluster = CLUSTER_PID_FDINFO, .item=2 },
{ .name = "drm_shared_cpu", .cluster = CLUSTER_PID_FDINFO, .item=3 },
{ .name = "drm_shared_gtt", .cluster = CLUSTER_PID_FDINFO, .item=4 },
{ .name = "drm_shared_vram", .cluster = CLUSTER_PID_FDINFO, .item=5 },
{ .name = "amd_evicted_visible_vram", .cluster = CLUSTER_PID_FDINFO, .item=6 },
{ .name = "amd_evicted_vram", .cluster = CLUSTER_PID_FDINFO, .item=7 },
{ .name = "amd_memory_visible_vram", .cluster = CLUSTER_PID_FDINFO, .item=8 },
{ .name = "amd_requested_gtt", .cluster = CLUSTER_PID_FDINFO, .item=9 },
{ .name = "amd_requested_visible_vram", .cluster = CLUSTER_PID_FDINFO, .item=10 },
{ .name = "amd_requested_vram", .cluster = CLUSTER_PID_FDINFO, .item=11 },
};

static dynproc_group_t dynproc_groups[] = {
[DYNPROC_GROUP_PSINFO] = { .name = "psinfo", .metrics = psinfo_metrics, .nmetrics = sizeof(psinfo_metrics)/sizeof(dynproc_metric_t)},
[DYNPROC_GROUP_ID] = { .name = "id", .metrics = id_metrics, .nmetrics = sizeof(id_metrics)/sizeof(dynproc_metric_t)},
@@ -267,6 +284,7 @@ static dynproc_group_t dynproc_groups[] = {
[DYNPROC_GROUP_NAMESPACE] = { .name = "namespaces", .metrics = namespace_metrics, .nmetrics = sizeof(namespace_metrics)/sizeof(dynproc_metric_t) },
[DYNPROC_GROUP_SMAPS] = { .name = "smaps", .metrics = smaps_metrics, .nmetrics = sizeof(smaps_metrics)/sizeof(dynproc_metric_t)},
[DYNPROC_GROUP_AUTOGROUP] = { .name = "autogroup", .metrics = autogroup_metrics, .nmetrics = sizeof(autogroup_metrics)/sizeof(dynproc_metric_t) },
[DYNPROC_GROUP_FDINFO] = { .name = "fdinfo", .metrics = fdinfo_metrics, .nmetrics = sizeof(fdinfo_metrics)/sizeof(dynproc_metric_t) },
};

/*
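
With the fdinfo group registered in the dynamic namespace, the new
metrics surface as proc.fdinfo.* and, via the hotproc cluster mapping
above, as hotproc.fdinfo.*. Once the PMDA is running they can be
inspected with the usual tools, e.g. `pminfo -tf proc.fdinfo.drm_memory_vram`
to show the one-line help text along with current per-process values.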
