Skip to content

Commit

Permalink
[FIXUP] gpu metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
nichamon committed Jul 10, 2023
1 parent 34cd863 commit 6ffb5d3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 27 deletions.
7 changes: 4 additions & 3 deletions ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,12 @@

#include "gmg_log.h"

extern ovis_log_t mylog;
extern ovis_log_t __gmg_log;

/*
 * setGmgLoggingFunction - install the log handle used by GMGLOG() and return
 * the value captured in oldPf.
 *
 * NOTE(review): this span is a commit-diff rendering. Removed and added lines
 * appear together without +/- markers, so it is not compilable as shown.
 * Judging by the identifiers, the `mylog`/`_mylog` lines look like the removed
 * side and the `__gmg_log`/`pi_log` lines like the added side - confirm
 * against the repository.
 */
ovis_log_t setGmgLoggingFunction(
const ovis_log_t _mylog) {
// Presumably the removed side: the chained assignment stores the NEW handle in
// both mylog and oldPf, so the function would return the handle it was just given.
ovis_log_t oldPf = mylog = _mylog;
const ovis_log_t pi_log) {
// Presumably the added side: remember the current handle before it is overwritten.
ovis_log_t oldPf = __gmg_log;
// NOTE(review): on the added side this message is emitted before the assignment
// below, i.e. through the handle that is about to be replaced.
GMGLOG(OVIS_LDEBUG, "Updated msglog\n");
__gmg_log = pi_log;
return oldPf;
}
6 changes: 3 additions & 3 deletions ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_log.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,18 @@
#include "ovis_log/ovis_log.h"
#include "ldmsd.h" // contains log function prototype; return type of log function is void.

// Log handle that GMGLOG() passes to ovis_log().
// NOTE(review): diff rendering - the `extern ... mylog` line looks like the removed
// side and the `__gmg_log` line like the added side; confirm against the repository.
// NOTE(review): the `__gmg_log` line has no `extern`, so each translation unit that
// includes this header gets its own tentative definition. Toolchains that default to
// -fno-common (GCC 10 and later) report that as a multiple-definition link error.
// Consider declaring it `extern` here and defining it once in gmg_log.c.
// NOTE(review): identifiers beginning with two underscores are reserved for the
// implementation by the C standard.
extern ovis_log_t mylog;
ovis_log_t __gmg_log;

/**
* The following are provided for convenience, since msglog is fully accessible.
*
* GMGLOG(LEVEL, FMT, ...) forwards LEVEL, FMT and any further arguments to
* ovis_log() together with the file-scope log handle, so callers do not pass
* a handle themselves. The do { ... } while (0) wrapper lets the macro be
* used as a single statement. `##__VA_ARGS__` is a GNU extension that drops
* the preceding comma when no variadic arguments are supplied.
*
* NOTE(review): diff rendering - the two ovis_log() lines below are presumably
* the removed (`mylog`) and added (`__gmg_log`) versions of one statement.
*/
// GMGLOG() is only used in gather_gpu_metrics_from_one_api.c and gmg_ldms_util.c.
// NOTE(review): setGmgLoggingFunction() in gmg_log.c invokes GMGLOG() as well.
#define GMGLOG(LEVEL, FMT, ...) do { \
ovis_log(mylog, (LEVEL), (FMT), ##__VA_ARGS__); \
ovis_log(__gmg_log, (LEVEL), (FMT), ##__VA_ARGS__); \
} while (0)

/**
* Set the log handle used by GMGLOG().
*
* Takes the handle to install and returns an ovis_log_t.
*
* NOTE(review): diff rendering - the two parameter lines below are presumably
* the removed (`mylog`) and added (`pi_log`) spellings of the same single
* parameter; confirm against the repository.
*/
ovis_log_t setGmgLoggingFunction(
const ovis_log_t mylog // ovis_log_t is already a pointer type
const ovis_log_t pi_log // ovis_log_t is already a pointer type
);

#endif // _GMG_LOG_H_
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
#include "gmg_ldms_util.h"
#include "gather_gpu_metrics_from_one_api.h"

// Log handle for this sampler: assigned from ovis_log_register() in get_plugin()
// and passed to ovis_log_destroy() in term() when non-NULL.
// NOTE(review): diff rendering - `mylog` is presumably the removed name and
// `__gpu_metrics_log` the added one. Identifiers beginning with two underscores
// are reserved for the implementation by the C standard.
static ovis_log_t mylog;
static ovis_log_t __gpu_metrics_log;

// Upper bound on how many devices sample() processes: it takes
// MIN(g_numberOfDevicesInSchema, numDevices). Where this is set is not visible here.
static uint32_t g_numberOfDevicesInSchema = 0;

Expand Down Expand Up @@ -104,13 +104,13 @@ void free_base() {
ze_driver_handle_t getGpuDriver() {
ze_result_t res = initializeOneApi(); // only slow the first time it is called for each process
if (res != ZE_RESULT_SUCCESS) {
ovis_log(mylog, OVIS_LERROR, "!!!initializeOneApi() => 0x%x\n", res);
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!initializeOneApi() => 0x%x\n", res);
return NULL;
}

ze_driver_handle_t hDriver = getDriver();
if (hDriver == NULL) {
ovis_log(mylog, OVIS_LERROR, "!!!getDriver() => NULL\n");
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!getDriver() => NULL\n");
return NULL;
}

Expand All @@ -130,14 +130,14 @@ static int create_metric_set_schema_and_set(base_data_t base) {

ze_driver_handle_t hDriver = getGpuDriver();
if (hDriver == NULL) {
ovis_log(mylog, OVIS_LERROR, "!!!getGpuDriver() => NULL\n");
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!getGpuDriver() => NULL\n");
goto err;
}

uint32_t numDevices = 0;
ze_device_handle_t *phDevices = enumerateGpuDevices(hDriver, &numDevices);
if (phDevices == NULL) {
ovis_log(mylog, OVIS_LERROR, "!!!enumerateGpuDevices(&numDevices=%p) => NULL, %d\n", &numDevices, numDevices);
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!enumerateGpuDevices(&numDevices=%p) => NULL, %d\n", &numDevices, numDevices);
goto err;
}
freeZeDeviceHandle(phDevices);
Expand All @@ -148,7 +148,7 @@ static int create_metric_set_schema_and_set(base_data_t base) {

schema = base_schema_new(base);
if (!schema) {
ovis_log(mylog, OVIS_LERROR,
ovis_log(__gpu_metrics_log, OVIS_LERROR,
"!!!%s: The schema '%s' could not be created, errno=%d.\n",
__FILE__, base->schema_name, errno);
rc = errno;
Expand Down Expand Up @@ -184,7 +184,7 @@ static int create_metric_set_schema_and_set(base_data_t base) {
/*
 * printValList - log every entry of av_list at OVIS_LDEBUG level as
 * "<szListName>[<index>] = <name>:<value>".
 *
 * szListName: label printed in front of each index.
 * av_list:    list to dump; dereferenced without a NULL check.
 *
 * NOTE(review): diff rendering - the two ovis_log() opening lines are presumably
 * the removed (`mylog`) and added (`__gpu_metrics_log`) versions of one call.
 * NOTE(review): `i` is a size_t but is formatted with `%d`; `%zu` is the
 * matching conversion. The same `i` is also passed to av_name() and
 * av_value_at_idx(), whose parameter types are not visible here.
 */
static void printValList(const char *szListName, struct attr_value_list *av_list) {
// Visit no more entries than the smaller of the list's count and size fields.
size_t listSize = MIN(av_list->count, av_list->size);
for (size_t i = 0; i < listSize; i++) {
ovis_log(mylog, OVIS_LDEBUG, "%s[%d] = %s:%s\n",
ovis_log(__gpu_metrics_log, OVIS_LDEBUG, "%s[%d] = %s:%s\n",
szListName, i, av_name(av_list, i), av_value_at_idx(av_list, i));
}
}
Expand All @@ -201,7 +201,7 @@ static int config_check(struct attr_value_list *keyword_list, struct attr_value_
for (i = 0; i < (sizeof(deprecated) / sizeof(deprecated[0])); i++) {
value = av_value(attribute_value_list, deprecated[i]);
if (value) {
ovis_log(mylog, OVIS_LERROR, SAMP ": !!!config argument %s has been deprecated.\n",
ovis_log(__gpu_metrics_log, OVIS_LERROR, SAMP ": !!!config argument %s has been deprecated.\n",
deprecated[i]);
return EINVAL;
}
Expand Down Expand Up @@ -234,7 +234,7 @@ static int config(struct ldmsd_plugin *self,

if (getSimulationMode() == true) {
// Log this ERROR so that it appears in /opt/clmgr/log/ldms_sampler.log
ovis_log(mylog, OVIS_LERROR, "Simulation mode is ON\n"); // no really an error so don't prefix with '!!!'
ovis_log(__gpu_metrics_log, OVIS_LERROR, "Simulation mode is ON\n"); // no really an error so don't prefix with '!!!'
}

printValList("keyword_list", keyword_list);
Expand All @@ -243,7 +243,7 @@ static int config(struct ldmsd_plugin *self,
int rc;

if (set) {
ovis_log(mylog, OVIS_LERROR, SAMP ": !!!Set already created.\n");
ovis_log(__gpu_metrics_log, OVIS_LERROR, SAMP ": !!!Set already created.\n");
return EINVAL;
}

Expand All @@ -254,7 +254,7 @@ static int config(struct ldmsd_plugin *self,

// Create an instance from the base "class". This is effectively calling
// the base class constructor.
base = base_config(attribute_value_list, SAMP, SAMP, mylog);
base = base_config(attribute_value_list, SAMP, SAMP, __gpu_metrics_log);
if (!base) {
rc = errno;
goto err;
Expand All @@ -264,7 +264,7 @@ static int config(struct ldmsd_plugin *self,
// is considered well-defined after the metric set schema is defined.
rc = create_metric_set_schema_and_set(base);
if (rc) {
ovis_log(mylog, OVIS_LERROR, SAMP ": !!!failed to create a metric set.\n");
ovis_log(__gpu_metrics_log, OVIS_LERROR, SAMP ": !!!failed to create a metric set.\n");
goto err;
}

Expand Down Expand Up @@ -292,20 +292,20 @@ static ldms_set_t get_set(struct ldmsd_sampler *self) {
*/
static int sample(struct ldmsd_sampler *self) {
if (!set) {
ovis_log(mylog, OVIS_LDEBUG, SAMP ": plugin not initialized\n");
ovis_log(__gpu_metrics_log, OVIS_LDEBUG, SAMP ": plugin not initialized\n");
return EINVAL;
}

ze_driver_handle_t hDriver = getGpuDriver();
if (hDriver == NULL) {
ovis_log(mylog, OVIS_LERROR, "!!!getGpuDriver() => NULL\n");
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!getGpuDriver() => NULL\n");
return EINVAL;
}

uint32_t numDevices = 0;
ze_device_handle_t *phDevices = enumerateGpuDevices(hDriver, &numDevices);
if (phDevices == NULL) {
ovis_log(mylog, OVIS_LERROR, "!!!enumerateGpuDevices(&numDevices=%p) => NULL, %d\n", &numDevices, numDevices);
ovis_log(__gpu_metrics_log, OVIS_LERROR, "!!!enumerateGpuDevices(&numDevices=%p) => NULL, %d\n", &numDevices, numDevices);
return EINVAL;
}
uint32_t numDevicesToSample = MIN(g_numberOfDevicesInSchema, numDevices); // cannot sample more than schema size
Expand All @@ -316,7 +316,7 @@ static int sample(struct ldmsd_sampler *self) {
size_t mallocCount = getMallocCount();
if (mallocCount != 1) {
// Only allocated memory is the device handler array.
ovis_log(mylog, OVIS_LERROR, SAMP ": !!!mallocCount=%ld != 1\n", mallocCount);
ovis_log(__gpu_metrics_log, OVIS_LERROR, SAMP ": !!!mallocCount=%ld != 1\n", mallocCount);
}

freeZeDeviceHandle(phDevices);
Expand All @@ -335,14 +335,14 @@ static void term(struct ldmsd_plugin *self) {
size_t mallocCount = getMallocCount();
if (mallocCount) {
// This following log message is never printed; maybe term was never called.
ovis_log(mylog, OVIS_LERROR, SAMP ": !!!mallocCount=%ld != 0\n", mallocCount);
ovis_log(__gpu_metrics_log, OVIS_LERROR, SAMP ": !!!mallocCount=%ld != 0\n", mallocCount);
}

free_base();
free_set();
free_schema();
if (mylog)
ovis_log_destroy(mylog);
if (__gpu_metrics_log)
ovis_log_destroy(__gpu_metrics_log);
}

/**
Expand All @@ -367,11 +367,12 @@ static struct ldmsd_sampler gpu_metrics_plugin = {
* @return plugin instance.
*/
struct ldmsd_plugin *get_plugin() {
// NOTE(review): diff rendering - the `mylog` lines are presumably the removed side,
// the `__gpu_metrics_log` lines and the setGmgLoggingFunction() call the added side.
// Register a log subsystem named "sampler." SAMP for this plugin.
mylog = ovis_log_register("sampler."SAMP, "Messages for the " SAMP " plugin");
if (!mylog) {
__gpu_metrics_log = ovis_log_register("sampler."SAMP, "Messages for the " SAMP " plugin");
if (!__gpu_metrics_log) {
// Registration failure is only reported (through a NULL handle); execution continues.
ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's "
"log subsystem. Error %d.\n", errno);
}
// Pass the handle on so GMGLOG() users share it; it is NULL here if registration failed.
setGmgLoggingFunction(__gpu_metrics_log);
// Reset the file-scope set before returning the plugin.
set = NULL;
return &gpu_metrics_plugin.base;
}

0 comments on commit 6ffb5d3

Please sign in to comment.