Skip to content

Commit

Permalink
Add support for sending OOME events (#7253)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbachorik committed Jul 1, 2024
1 parent ffe2b6b commit cdecb42
Show file tree
Hide file tree
Showing 18 changed files with 1,038 additions and 60 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package datadog.communication.monitor;

import com.timgroup.statsd.Event;
import com.timgroup.statsd.ServiceCheck;
import datadog.trace.api.StatsDClient;
import java.util.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

final class DDAgentStatsDClient implements StatsDClient {
public final class DDAgentStatsDClient implements StatsDClient {
private static final Logger log = LoggerFactory.getLogger(DDAgentStatsDClient.class);
private final DDAgentStatsDConnection connection;
private final Function<String, String> nameMapping;
private final Function<String[], String[]> tagMapping;
Expand Down Expand Up @@ -84,6 +88,30 @@ public void serviceCheck(
connection.statsd.recordServiceCheckRun(serviceCheck);
}

/**
* Record a statsd event
*
* @param type the type of event (error, warning, info, success - @see Event.AlertType)
* @param source the source of the event (e.g. java, myapp, CrashTracking, Telemetry, etc)
* @param eventName the name of the event (or title)
* @param message the message of the event
* @param tags the tags to attach to the event
*/
public void recordEvent(
String type, String source, String eventName, String message, String... tags) {
Event.AlertType alertType = Event.AlertType.valueOf(type.toUpperCase());
log.debug(
"Recording event: {} - {} - {} - {} [{}]", alertType, source, eventName, message, tags);
Event.Builder eventBuilder =
Event.builder()
.withTitle(eventName)
.withText(message)
.withSourceTypeName(source)
.withDate(System.currentTimeMillis())
.withAlertType(alertType);
connection.statsd.recordEvent(eventBuilder.build(), tagMapping.apply(tags));
}

static ServiceCheck.Status serviceCheckStatus(final String status) {
switch (status) {
case "OK":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,47 @@ class DDAgentStatsDClientTest extends DDSpecification {
// spotless:on
}

def "single statsd client with event"() {
setup:
injectSysConfig(DOGSTATSD_START_DELAY, '0')
def server = new StatsDServer()
server.start()

def client = statsDClientManager().statsDClient('127.0.0.1', server.socket.localPort, null, namespace, constantTags as String[], false)

String[] tags = ["type:BufferPool", "jmx_domain:java.nio"]

expect:
client.recordEvent(eventType, "test", "test.event", "test event", tags)
def event = server.waitForMessage()
event.startsWith("_e{10,10}:test.event|test event|d:") && event.contains("|t:$expectedType|s:test|#$expectedTags")

cleanup:
client.close()
server.close()

where:
// spotless:off
namespace | eventType | expectedType | constantTags | expectedTags
null | "INFO" | "info" | null | "jmx_domain:java.nio,type:BufferPool"
null | "INFO" | "info" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
"example" | "INFO" | "info" | null | "jmx_domain:java.nio,type:BufferPool"
"example" | "INFO" | "info" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
null | "WARNING" | "warning" | null | "jmx_domain:java.nio,type:BufferPool"
null | "WARNING" | "warning" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
"example" | "WARNING" | "warning" | null | "jmx_domain:java.nio,type:BufferPool"
"example" | "WARNING" | "warning" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
null | "ERROR" | "error" | null | "jmx_domain:java.nio,type:BufferPool"
null | "ERROR" | "error" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
"example" | "ERROR" | "error" | null | "jmx_domain:java.nio,type:BufferPool"
"example" | "ERROR" | "error" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
null | "SUCCESS" | "success" | null | "jmx_domain:java.nio,type:BufferPool"
null | "SUCCESS" | "success" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
"example" | "SUCCESS" | "success" | null | "jmx_domain:java.nio,type:BufferPool"
"example" | "SUCCESS" | "success" | ["lang:java", "lang_version:1.8.0"] | "jmx_domain:java.nio,type:BufferPool,lang:java,lang_version:1.8.0"
// spotless:on
}

def "multiple statsd clients"() {
setup:
injectSysConfig(DOGSTATSD_START_DELAY, '0')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -638,8 +638,8 @@ private static synchronized void startJmx() {
if (jmxStarting.getAndSet(true)) {
return; // another thread is already in startJmx
}
// crash uploader initialization relies on JMX being available
initializeCrashUploader();
// error tracking initialization relies on JMX being available
initializeErrorTracking();
if (jmxFetchEnabled) {
startJmxFetch();
}
Expand Down Expand Up @@ -870,7 +870,7 @@ private static void stopTelemetry() {
}
}

private static void initializeCrashUploader() {
private static void initializeErrorTracking() {
if (Platform.isJ9()) {
// TODO currently crash tracking is supported only for HotSpot based JVMs
return;
Expand Down
11 changes: 5 additions & 6 deletions dd-java-agent/agent-crashtracking/build.gradle
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
apply from: "$rootDir/gradle/java.gradle"

// FIXME: Improve test coverage.
minimumBranchCoverage = 0.6
// runtime dependent parts (eg. looking up values from the JVM args) are not easy to exercise in unit tests
// the minimum coverage is reduced to reflect that
// minimumInstructionCoverage = 0.9
minimumInstructionCoverage = 0.7
// The functionality is tested in dd-smoke-tests/crashtracking

minimumBranchCoverage = 0.0
minimumInstructionCoverage = 0.0
excludedClassesCoverage += ['com.datadog.crashtracking.*']

tasks.withType(Test).configureEach { subTask ->
dependsOn ':dd-java-agent:shadowJar'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ public CrashUploader() {

ConfigProvider configProvider = config.configProvider();

System.out.println("===> telemetryUrl: " + telemetryUrl);
telemetryClient =
OkHttpUtils.buildHttpClient(
config,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.datadog.crashtracking;

import static datadog.communication.monitor.DDAgentStatsDClientManager.statsDClientManager;

import datadog.communication.monitor.DDAgentStatsDClient;
import de.thetaphi.forbiddenapis.SuppressForbidden;
import java.util.concurrent.locks.LockSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class OOMENotifier {
private static final Logger log = LoggerFactory.getLogger(OOMENotifier.class);

// This method is called via CLI so we don't need to be paranoid about the forbiddend APIs
@SuppressForbidden
public static void sendOomeEvent(String taglist) {
try (DDAgentStatsDClient client =
(DDAgentStatsDClient)
statsDClientManager().statsDClient(null, null, null, null, null, false)) {
String[] tags = taglist.split(",");
client.recordEvent(
"error",
"java",
"OutOfMemoryError",
"Java process encountered out of memory error",
tags);
log.info("OOME event sent");
LockSupport.parkNanos(2_000_000_000L); // wait 2s to allow statsd client flushing the event
}
}
}
Loading

0 comments on commit cdecb42

Please sign in to comment.