diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java
new file mode 100644
index 0000000000000..a93f09d42b767
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsService.java
@@ -0,0 +1,276 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager;
+
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FileContent;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueData;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.*;
+
+/**
+ * Utility methods to launch the diagnostic scripts.
+ */
+@InterfaceAudience.Private
+public final class DiagnosticsService {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DiagnosticsService.class);
+ private static final String PYTHON_COMMAND = "python3";
+ private static final String COLON = ":";
+ private static final String COMMA = ",";
+ private static final String OUT_DIR_PREFIX = "/tmp";
+ private static final String EXECUTION_ERROR_MESSAGE = "Error occurred " +
+ "during the execution of the diagnostic script with the command '{}'.";
+ private static final String INCORRECT_NUMBER_OF_PARAMETERS_MESSAGE =
+ "Error while parsing diagnostic option, incorrect number of " +
+ "parameters. Expected 1 or 2, but got {}. Skipping this option.";
+ private static String scriptLocation;
+
+ private DiagnosticsService() {
+ // hidden constructor
+ }
+
+ public static CommonIssues listCommonIssues() throws Exception {
+ if (Shell.WINDOWS) {
+ throw new UnsupportedOperationException("Not implemented for Windows.");
+ }
+ CommonIssues issueTypes = new CommonIssues();
+ ProcessBuilder pb = createProcessBuilder(CommandArgument.LIST_ISSUES);
+
+ List result = executeCommand(pb);
+
+ System.out.println("-----------------------------Common Issue Result: " + result);
+
+ for (String line : result) {
+ issueTypes.add(parseIssueType(line));
+ }
+
+ return issueTypes;
+ }
+
+ public static IssueData collectIssueData(String issueId, List args)
+ throws Exception {
+ if (Shell.WINDOWS) {
+ throw new UnsupportedOperationException("Not implemented for Windows.");
+ }
+ ProcessBuilder pb = createProcessBuilder(CommandArgument.COMMAND, issueId,
+ args);
+
+ LOG.info("Diagnostic process environment: {}", pb.environment());
+
+ List result = executeCommand(pb);
+ Optional outputDirectory = result.stream()
+ .filter(e -> e.contains(OUT_DIR_PREFIX))
+ .findFirst();
+
+ if (!outputDirectory.isPresent()) {
+ LOG.error(EXECUTION_ERROR_MESSAGE, pb.command());
+ throw new IOException("Output directory in result not found.");
+ }
+
+
+ return collectIssueFilesContent(new File(outputDirectory.get()));
+ }
+
+ public static IssueData collectIssueFilesContent(File currentDir){
+ IssueData issueData = new IssueData();
+ File[] files = currentDir.listFiles();
+
+ if (!currentDir.exists()){
+ LOG.error("Directory does not exist: {}", currentDir);
+ return issueData;
+ }
+
+ if (files == null) {
+ return issueData;
+ }
+
+ for (File file : files) {
+ try {
+ if (file.isDirectory()) {
+ issueData.getFiles().addAll(collectIssueFilesContent(file).getFiles());
+ } else {
+ String contentType = Files.probeContentType(file.toPath());
+ if (contentType == null) contentType = "text/plain"; // Default if not found
+
+ String content = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
+ FileContent fileContent = new FileContent();
+ fileContent.setFilename(currentDir.toPath().relativize(file.toPath()).toString()); // Set the relative path as name
+ fileContent.setContentType(contentType);
+ fileContent.setContent(content);
+
+ issueData.getFiles().add(fileContent);
+ }
+ } catch (IOException e) {
+ LOG.error("Failed to process {}: {}", file, e.getMessage());
+ }
+ }
+ return issueData;
+ }
+
+ @VisibleForTesting
+ protected static ProcessBuilder createProcessBuilder(CommandArgument argument) throws IOException {
+ return createProcessBuilder(argument, null, null);
+ }
+
+ @VisibleForTesting
+ protected static ProcessBuilder createProcessBuilder(
+ CommandArgument argument, String issueId, List additionalArgs) throws IOException {
+ List commandList =
+ new ArrayList<>(Arrays.asList(PYTHON_COMMAND, getScriptLocation(),
+ argument.getShortOption()));
+
+ if (argument.equals(CommandArgument.COMMAND)) {
+ commandList.add(issueId);
+ if (additionalArgs != null) {
+ commandList.add(CommandArgument.ARGUMENTS.getShortOption());
+ commandList.addAll(additionalArgs);
+ }
+ }
+
+ return new ProcessBuilder(commandList);
+ }
+
+ private static List executeCommand(ProcessBuilder pb)
+ throws Exception {
+ Process process = pb.start();
+ int exitCode;
+ List result = new ArrayList<>();
+
+ try (
+ BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream(),
+ StandardCharsets.UTF_8));
+ BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream(),
+ StandardCharsets.UTF_8));
+ ) {
+
+ String line;
+ while ((line = stdoutReader.readLine()) != null) {
+ result.add(line);
+ }
+
+ List errors = new ArrayList<>();
+ while ((line = stderrReader.readLine()) != null) {
+ errors.add(line);
+ }
+ if (!errors.isEmpty()) {
+ LOG.error("Python script stderr: {}", errors);
+ }
+
+ process.waitFor();
+ } catch (Exception e) {
+ LOG.error(EXECUTION_ERROR_MESSAGE, pb.command());
+ throw e;
+ }
+ exitCode = process.exitValue();
+ if (exitCode != 0) {
+ throw new IOException("The collector script exited with non-zero " +
+ "exit code: " + exitCode);
+ }
+
+ return result;
+ }
+
+ @VisibleForTesting
+ protected static IssueType parseIssueType(String line) {
+ String[] issueParams = line.split(COLON);
+ IssueType parsedIssueType;
+
+ if (issueParams.length < 1 || issueParams.length > 2) {
+ LOG.warn(INCORRECT_NUMBER_OF_PARAMETERS_MESSAGE,
+ issueParams.length);
+ return null;
+ } else {
+ String name = issueParams[0];
+ parsedIssueType = new IssueType(name);
+ if (issueParams.length == 2) {
+ List parameterList =
+ Arrays.asList(issueParams[1].split(COMMA));
+ parsedIssueType.setParameters(parameterList);
+ }
+ }
+
+ return parsedIssueType;
+ }
+
+ @VisibleForTesting
+ protected static void setScriptLocation(String scriptLocationParam) {
+ scriptLocation = scriptLocationParam;
+ }
+
+ enum CommandArgument{
+ LIST_ISSUES("-l"),
+ COMMAND("-c"),
+ ARGUMENTS("-a");
+
+ private final String shortOption;
+
+ CommandArgument(String shortOption) {
+ this.shortOption = shortOption;
+ }
+
+ public CommandArgument fromString(String option) {
+ for (CommandArgument arg : CommandArgument.values()) {
+ if (arg.shortOption.equals(option)) {
+ return arg;
+ }
+ }
+ return null;
+ }
+
+ public String getShortOption() {
+ return shortOption;
+ }
+
+ }
+
+ private static String getScriptLocation() throws IOException {
+ if (scriptLocation != null) {
+ return scriptLocation;
+ }
+
+ InputStream in = DiagnosticsService.class.getClassLoader()
+ .getResourceAsStream("diagnostics/diagnostics_collector.py");
+
+ if (in == null) {
+ throw new FileNotFoundException(
+ "Resource diagnostics/diagnostics_collector.py not found in classpath");
+ }
+
+ File tempScript = File.createTempFile("diagnostics_collector", ".py");
+ Files.copy(in, tempScript.toPath(), StandardCopyOption.REPLACE_EXISTING);
+ tempScript.setExecutable(true);
+
+ scriptLocation = tempScript.getAbsolutePath();
+ return scriptLocation;
+ }
+
+
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java
index de75631486b92..c9c48eaa217cc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWSConsts.java
@@ -42,6 +42,13 @@ public final class RMWSConsts {
/** Path for {@code RMWebServiceProtocol#getClusterMetricsInfo}. */
public static final String METRICS = "/metrics";
+ /** Path for {@code RMWebServices#getCommonIssueList}. */
+ public static final String COMMON_ISSUE_LIST = "/common-issues/list";
+
+ /** Path for {@code RMWebServices#getCommonIssueData}. */
+ public static final String COMMON_ISSUE_COLLECT =
+ "/common-issues/collect";
+
/** Path for {@code RMWebServiceProtocol#getSchedulerInfo}. */
public static final String SCHEDULER = "/scheduler";
@@ -219,6 +226,8 @@ public final class RMWSConsts {
// ----------------QueryParams for RMWebServiceProtocol----------------
+ public static final String ISSUEID = "issueId";
+ public static final String ISSUEARGS = "args";
public static final String TIME = "time";
public static final String STATES = "states";
public static final String NODEID = "nodeId";
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java
index 47de8ee591afd..0951c9f5e026c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServiceProtocol.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.webapp;
import java.io.IOException;
+import java.util.List;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
@@ -47,6 +48,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
@@ -112,6 +114,25 @@ public interface RMWebServiceProtocol {
*/
ClusterMetricsInfo getClusterMetricsInfo();
+ /**
+ * This method retrieves the common diagnosable issue list, and it is
+ * reachable by using {@link RMWSConsts#COMMON_ISSUE_LIST}.
+ *
+ * @return the list of available diagnostic cases
+ */
+ CommonIssues getCommonIssueList();
+
+ /**
+ * This method retrieves the diagnostic information for the selected issue,
+ * and it is reachable by using {@link RMWSConsts#COMMON_ISSUE_COLLECT}.
+ *
+ * @param issueId the selected issue's ID. It is a QueryParam.
+ * @param args the necessary arguments for diagnosing the issue.
+ * It is a QueryParam.
+ * @return the associated diagnostic information to the selected issue
+ */
+ Response getCommonIssueData(String issueId, List args);
+
/**
* This method retrieves the current scheduler status, and it is reachable by
* using {@link RMWSConsts#SCHEDULER}.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java
index b6ea2d438c0b4..329d49950213d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebServices.java
@@ -137,6 +137,7 @@
import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier;
import org.apache.hadoop.yarn.server.api.protocolrecords.UpdateNodeResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.AdminService;
+import org.apache.hadoop.yarn.server.resourcemanager.DiagnosticsService;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
@@ -177,6 +178,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FifoSchedulerInfo;
@@ -223,6 +225,7 @@
import org.apache.hadoop.yarn.webapp.BadRequestException;
import org.apache.hadoop.yarn.webapp.ForbiddenException;
import org.apache.hadoop.yarn.webapp.NotFoundException;
+import org.apache.hadoop.yarn.webapp.WebAppException;
import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
import org.apache.hadoop.yarn.webapp.dao.ConfInfo;
import org.apache.hadoop.yarn.webapp.dao.SchedConfUpdateInfo;
@@ -408,6 +411,42 @@ public ClusterMetricsInfo getClusterMetricsInfo() {
return new ClusterMetricsInfo(this.rm);
}
+ @GET
+ @Path(RMWSConsts.COMMON_ISSUE_LIST)
+ @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
+ MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 })
+ @Override
+ public CommonIssues getCommonIssueList() {
+ initForReadableEndpoints();
+ try {
+ return DiagnosticsService.listCommonIssues();
+ } catch (Exception e) {
+ throw new WebAppException("Error collecting the common " +
+ "issue types. Error message: " + e.getMessage() + ". " +
+ "For more information please check the ResourceManager logs.");
+ }
+ }
+
+ @GET
+ @Path(RMWSConsts.COMMON_ISSUE_COLLECT)
+ @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
+ MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 })
+ @Override
+ public Response getCommonIssueData(
+ @QueryParam(RMWSConsts.ISSUEID) String issueId,
+ @QueryParam(RMWSConsts.ISSUEARGS) List args) {
+ initForReadableEndpoints();
+ try {
+ return Response.status(Status.OK)
+ .entity(DiagnosticsService.collectIssueData(issueId, args))
+ .build();
+ } catch (Exception e) {
+ throw new WebAppException("Error collecting the selected " +
+ "issue data. Error message: " + e.getMessage() + ". " +
+ "For more information please check the ResourceManager logs.");
+ }
+ }
+
@GET
@Path(RMWSConsts.SCHEDULER)
@Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java
new file mode 100644
index 0000000000000..5ccf36eba7bd5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/CommonIssues.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
+
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.ArrayList;
+import java.util.List;
+
+@XmlRootElement(name = "commonIssues")
+@XmlAccessorType(XmlAccessType.FIELD)
+public class CommonIssues {
+ private List issue = new ArrayList<>();
+
+ public CommonIssues() {}
+
+ public void add(IssueType type) {
+ if (type != null &&
+ issue.stream().noneMatch(e-> e.getName().equals(type.getName()))) {
+ issue.add(type);
+ }
+ }
+
+ public List getIssueList() {
+ return issue;
+ }
+
+ public void addAll(List issueTypes) {
+ issue.addAll(issueTypes);
+ }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/FileContent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/FileContent.java
new file mode 100644
index 0000000000000..1b7a6a2ffe8bd
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/FileContent.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
+
+import javax.xml.bind.annotation.XmlRootElement;
+
+@XmlRootElement
+public class FileContent {
+ private String filename;
+ private String contentType;
+ private String content;
+
+ public String getFilename() { return filename; }
+ public void setFilename(String filename) { this.filename = filename; }
+
+ public String getContentType() { return contentType; }
+ public void setContentType(String contentType) { this.contentType = contentType; }
+
+ public String getContent() { return content; }
+ public void setContent(String content) { this.content = content; }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueData.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueData.java
new file mode 100644
index 0000000000000..d205070be9a35
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueData.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.ArrayList;
+import java.util.List;
+
+@XmlRootElement
+public class IssueData {
+ private List files = new ArrayList<>();
+
+ @XmlElement(name = "file") // To make the endpoint result less ambiguous
+ public List getFiles() {
+ return files;
+ }
+
+ public void setFiles(List files) { this.files = files; }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java
new file mode 100644
index 0000000000000..27f96a0ae2b5d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/IssueType.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
+
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+@XmlRootElement(name = "issue")
+@XmlAccessorType(XmlAccessType.FIELD)
+public class IssueType {
+ private String name;
+ private List parameters = new ArrayList<>();
+
+ public IssueType() {}
+
+ public IssueType(String name) {
+ this.name = name;
+ this.parameters = Collections.emptyList();
+ }
+
+ public IssueType(String name, List parameters) {
+ this.name = name;
+ this.parameters = new ArrayList<>(parameters);
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public List getParameters() {
+ return parameters;
+ }
+
+ public void setParameters(List parameters) {
+ this.parameters = parameters;
+ }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java
new file mode 100644
index 0000000000000..8a0af4287f564
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/DiagnosticsServiceTest.java
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager;
+
+
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.IssueType;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static org.junit.Assert.fail;
+
+
+public class DiagnosticsServiceTest {
+ private static final String ISSUE_NAME_APP_DIAGNOSTIC = "application_diagnostic";
+ private static final String ISSUE_NAME_APP_FAILED = "application_failed";
+ private static final String ISSUE_NAME_APP_FAILED_NECESSARY_ARGS =
+ "application_failed_necessary_args";
+ private static final String ISSUE_NAME_APP_HANGING = "application_hanging";
+ private static final String ISSUE_NAME_SCHED_ISSUE =
+ "scheduler_related_issue";
+ private static final String ISSUE_NAME_RM_NM_ISSUE = "rm_nm_start_failure";
+ private static final String ISSUE_ARG_APP_ID = "appId";
+ private static final String ISSUE_ARG_NODE_ID = "nodeId";
+ private static final String COLON = ":";
+ private static final String COMMA = ",";
+ private static final String OUTPUT_DIR = "/tmp";
+
+ @Before
+ public void setUp() {
+ DiagnosticsService.setScriptLocation("src/test/resources/diagnostics" +
+ "/diagnostics_collector_test.py");
+ handleWindowsRuntime();
+ }
+
+ @Test
+ public void testListCommonIssuesValidCaseWithOptionsToBeSkipped()
+ throws Exception {
+ // The test script contains two invalid options: one with an ambiguous name
+ // and one with too many parameters. These should be skipped silently.
+ CommonIssues commonIssues = DiagnosticsService.listCommonIssues();
+
+ Assert.assertEquals(2, commonIssues.getIssueList().size());
+ assertIssueEquality(ISSUE_NAME_APP_DIAGNOSTIC,
+ Collections.singletonList(ISSUE_ARG_APP_ID),
+ commonIssues.getIssueList().get(0));
+
+ assertIssueEquality(ISSUE_NAME_SCHED_ISSUE,
+ Collections.emptyList(),
+ commonIssues.getIssueList().get(1));
+ }
+
+ @Test(expected = IOException.class)
+ public void testListCommonIssuesScriptMissing() throws Exception {
+ DiagnosticsService.setScriptLocation("/src/invalidLocation/script.py");
+ DiagnosticsService.listCommonIssues();
+ }
+
+// @Test
+// public void testCollectIssueDataPathValidOutput() throws Exception {
+// // valid case: the script prints out one directory
+// Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueDataPath(
+// ISSUE_NAME_APP_FAILED, null));
+// }
+//
+// @Test
+// public void testCollectIssueDataPathValidOutputWhenArgsArePresent()
+// throws Exception {
+// // valid case: appId and nodeId are necessary params and they are present
+// Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueDataPath(
+// ISSUE_NAME_APP_FAILED_NECESSARY_ARGS,
+// Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_NODE_ID)));
+// }
+//
+// @Test(expected = IOException.class)
+// public void testCollectIssueDataPathInvalidOutputWhenWrongArgsArePresent()
+// throws Exception {
+// // valid case: appId and nodeId are necessary params but two appIds are
+// // given
+// Assert.assertEquals(OUTPUT_DIR, DiagnosticsService.collectIssueDataPath(
+// ISSUE_NAME_APP_FAILED_NECESSARY_ARGS,
+// Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_APP_ID)));
+// }
+//
+// @Test(expected = IOException.class)
+// public void testCollectIssueDataPathInvalidOutputEmptyDir() throws Exception {
+// // invalid case: the script prints out an empty string as directory
+// // with the correct prefix
+// DiagnosticsService.collectIssueDataPath(ISSUE_NAME_APP_HANGING, null);
+// }
+//
+ @Test(expected = IOException.class)
+ public void testCollectIssueDataPathInvalidOutputMissingOutputDir()
+ throws Exception {
+ // invalid case: the script doesn't print out the correct output directory
+ DiagnosticsService.collectIssueDataPath(ISSUE_NAME_SCHED_ISSUE, null);
+ }
+//
+// @Test(expected = IOException.class)
+// public void testCollectIssueDataPathInvalidOutputMissingPrints()
+// throws Exception {
+// // invalid case: the script doesn't print out anything
+// DiagnosticsService.collectIssueDataPath(ISSUE_NAME_RM_NM_ISSUE, null);
+// }
+//
+ @Test(expected = IOException.class)
+ public void testCollectIssueDataPathScriptMissing() throws Exception {
+ DiagnosticsService.setScriptLocation("/src/invalidLocation/script.py");
+ DiagnosticsService.collectIssueDataPath(ISSUE_NAME_APP_DIAGNOSTIC, null);
+ }
+
+ @Test
+ public void testParseIssueTypeValidCases() {
+ // valid case: name, no parameters
+ String line = ISSUE_NAME_APP_FAILED;
+
+ assertIssueEquality(ISSUE_NAME_APP_FAILED, Collections.emptyList(),
+ DiagnosticsService.parseIssueType(line));
+
+ // valid case: name, one parameter
+ line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_ARG_APP_ID;
+
+ assertIssueEquality(ISSUE_NAME_APP_FAILED,
+ Collections.singletonList(ISSUE_ARG_APP_ID),
+ DiagnosticsService.parseIssueType(line));
+
+ // valid case: name, two parameters
+ line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_ARG_APP_ID +
+ COMMA + ISSUE_ARG_NODE_ID;
+
+ assertIssueEquality(ISSUE_NAME_APP_FAILED,
+ Arrays.asList(ISSUE_ARG_APP_ID, ISSUE_ARG_NODE_ID),
+ DiagnosticsService.parseIssueType(line));
+ }
+
+ @Test
+ public void testParseIssueTypeInvalidCases() {
+ // invalid case: too many values
+ String line = ISSUE_NAME_APP_FAILED + COLON + ISSUE_NAME_APP_FAILED +
+ COLON + ISSUE_NAME_APP_FAILED;
+
+ IssueType issueType = DiagnosticsService.parseIssueType(line);
+ Assert.assertNull(issueType);
+ }
+
+ private void assertIssueEquality(String expectedIssueName,
+ List expectedParams,
+ IssueType actualIssue) {
+ Assert.assertEquals(expectedIssueName,
+ actualIssue.getName());
+ Assert.assertEquals(expectedParams.size(),
+ actualIssue.getParameters().size());
+ Assert.assertTrue(CollectionUtils.isEqualCollection(
+ expectedParams, actualIssue.getParameters()));
+ }
+
+ private void handleWindowsRuntime() {
+ if (Shell.WINDOWS) {
+ try {
+ DiagnosticsService.listCommonIssues();
+ fail("On Windows listCommonIssues should throw " +
+ "UnsupportedOperationException");
+ } catch (Exception e) {
+ // Exception is expected
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py
new file mode 100644
index 0000000000000..42ced7311e36a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/diagnostics/diagnostics_collector_test.py
@@ -0,0 +1,316 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import argparse
+import sys, os
+import subprocess
+from datetime import datetime, timedelta
+from urllib import request, error
+import xml.etree.ElementTree as ET
+import re
+import time
+
+TEMP_DIR = "/tmp"
+HADOOP_CONF_DIR = "/etc/hadoop"
+YARN_SITE_XML = "yarn-site.xml"
+RM_ADDRESS_PROPERTY_NAME = "yarn.resourcemanager.webapp.address"
+
+RM_LOG_REGEX = r"(?<=\")\/logs.+?RESOURCEMANAGER.+?(?=\")"
+NM_LOG_REGEX = r"(?<=\")\/logs.+?NODEMANAGER.+?(?=\")"
+INPUT_TIME_FORMAT = '%a %b %d %H:%M:%S %Z %Y' # e.g. Wed May 28 07:35:39 UTC 2025
+OUTPUT_TIME_FORMAT = '%Y-%m-%d %H:%M:%S,%f' # e.g. 2025-05-28 11:57:05,435
+OUTPUT_TIME_FORMAT_WITHOUT_SECOND = '%Y-%m-%d %H:%M' # e.g. 2025-05-28 11:57
+NUMBER_OF_JSTACK = 3
+
+
+def application_diagnostic():
+ """
+ Application Logs, Application Info, Application Attempts
+ Multiple JStack of Hanging Containers and NodeManager
+ ResourceManager logs during job duration.
+ NodeManager logs from NodeManager where hanging containers of jobs run during the duration of containers.
+ """
+
+ if args.arguments is None or len(args.arguments) == 0:
+ print("Missing application or job id, exiting...")
+ sys.exit(os.EX_USAGE)
+
+ app_id = args.arguments[0]
+
+ output_path = create_output_dir(os.path.join(TEMP_DIR, app_id))
+
+ # Get JStack of the hanging containers
+ nm_address = get_nodemanager_address(app_id)
+ app_jstack = create_request("http://{}/ws/v1/node/apps/{}/jstack".format(nm_address, app_id), False)
+ write_output(output_path, "application_jstack", app_jstack)
+
+ # Get JStack of the hanging NodeManager
+ nm_jstack = create_request("http://{}/ws/v1/node/jstack".format(nm_address), False)
+ write_output(output_path, "nm_{}_jstack".format(nm_address), nm_jstack)
+
+ # Get application info
+ app_info= create_request("http://{}/ws/v1/cluster/apps/{}".format(RM_ADDRESS, app_id))
+ write_output(output_path, "application_info", app_info)
+
+ # Get application attempts
+ app_attempts = create_request("http://{}/ws/v1/cluster/apps/{}/appattempts".format(RM_ADDRESS, app_id))
+ write_output(output_path, "application_attempts", app_attempts)
+
+ # Get start_time and end_time of the application
+ start_time, end_time = get_application_time(app_info)
+
+ # Get RM log
+ log_address = get_node_log_address(RM_ADDRESS, RM_LOG_REGEX)
+ write_output(os.path.join(output_path, "node_log"), "resourcemanager_log",
+ filter_node_log(log_address, start_time, end_time))
+
+ # Get NodeManager logs in the duration of containers belonging to app_id
+ if "amHostHttpAddress" in app_info:
+ app_info = ET.fromstring(app_info)
+ nm_address = app_info.find("amHostHttpAddress").text
+ log_address = get_node_log_address(nm_address, NM_LOG_REGEX)
+ write_output(os.path.join(output_path, "node_log"), "nodemanager_log", get_container_log(log_address, app_id))
+
+ # Get application log
+ command = run_cmd_and_save_output(os.path.join(output_path, "app_logs"), app_id, "yarn", "logs", "-applicationId",
+ app_id) # TODO user permission?
+
+ command.communicate()
+ return output_path
+
+
+def scheduler_related_issue():
+ """
+ ResourceManager Scheduler Logs with DEBUG enabled for 2 minutes.
+ Multiple Jstack of ResourceManager
+ YARN and Scheduler Configuration
+ Cluster Scheduler API /ws/v1/cluster/scheduler and Cluster Nodes API /ws/v1/cluster/nodes response
+ Scheduler Activities /ws/v1/cluster/scheduler/bulk-activities response
+ """
+ output_path = create_output_dir(os.path.join(TEMP_DIR, "scheduler_related_issue" + str(time.time()).split(".")[0]))
+
+ # Multiple JStack of ResourceManager
+ rm_pids = get_resourcemanager_pid()
+ jstacks_output = get_multiple_jstack(rm_pids)
+ write_output(output_path, "jstacks_resourcemanager", jstacks_output)
+
+ # Get Cluster Scheduler Info
+ scheduler_info = create_request("http://{}/ws/v1/cluster/scheduler".format(RM_ADDRESS))
+ write_output(output_path, "scheduler_info", scheduler_info)
+
+ # Get Cluster Nodes Info
+ nodes_info = create_request("http://{}/ws/v1/cluster/nodes".format(RM_ADDRESS))
+ write_output(output_path, "nodemanager_info", nodes_info)
+
+ # Get Scheduler Activities
+ scheduler_activities = create_request("http://{}/ws/v1/cluster/scheduler/bulk-activities".format(RM_ADDRESS))
+ write_output(output_path, "scheduler_activities", scheduler_activities)
+
+ # Get Scheduler Configuration
+ scheduler_config = create_request("http://{}/ws/v1/cluster/scheduler-conf".format(RM_ADDRESS))
+ write_output(output_path, "scheduler_configuration", scheduler_config)
+
+ # Get YARN configuration yarn-site.xml
+ yarn_conf = run_command("cat", os.path.join(HADOOP_CONF_DIR, YARN_SITE_XML))
+ write_output(output_path, "yarn_site", yarn_conf)
+
+ # Get RM Debug log for the last 2 minutes
+ enable_debug_log = set_rm_scheduler_log_level("DEBUG")
+ print(enable_debug_log)
+ log_address = get_node_log_address(RM_ADDRESS, RM_LOG_REGEX)
+ start_time, end_time = (format_datetime_no_seconds(datetime.now() - timedelta(seconds=120)),
+ format_datetime_no_seconds(datetime.now()))
+ rm_debug_log = filter_node_log(log_address, start_time, end_time)
+ write_output(output_path, "rm_debug_log_2min", rm_debug_log)
+ enable_info_log = set_rm_scheduler_log_level("INFO")
+ print(enable_info_log)
+
+ return output_path
+
+####################################################### Utils Functions ###############################################
+
+
+def list_issues():
+ print("application_diagnostic:appId", "scheduler_related_issue", sep="\n")
+
+
+def parse_url_from_conf(conf_file, url_property_name):
+ root = ET.parse(os.path.join(HADOOP_CONF_DIR, conf_file))
+ for prop in root.findall("property"):
+ prop_name = prop.find("name").text
+ if prop_name == url_property_name:
+ return prop.find("value").text
+
+ return None
+
+
+def create_output_dir(dir_path):
+ if not os.path.exists(dir_path):
+ os.makedirs(dir_path)
+ return dir_path
+
+
+def write_output(output_path, out_filename, value):
+ output_path = create_output_dir(output_path)
+ with open(os.path.join(output_path, out_filename), 'w') as f:
+ f.write(value)
+
+
+def run_command(*argv):
+ try:
+ cmd = " ".join(arg for arg in argv)
+ print("Running command with arguments:", cmd)
+ response = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, check=True)
+ response_str = response.stdout.decode('utf-8')
+ except subprocess.CalledProcessError as e:
+ response_str = "Command failed with error: {}".format(e)
+ print("Unable to run command: ", response_str)
+ except Exception as e:
+ response_str = "Exception occurred: {}".format(e)
+ print("Exception occurred: ", response_str)
+
+ return response_str
+
+
+def run_cmd_and_save_output(output_path, out_filename, *argv):
+ file_path = os.path.join(create_output_dir(output_path), out_filename)
+ with open(file_path, 'w') as f:
+ return subprocess.Popen(argv, stdout=f)
+
+
+def create_request(url, xml_type=True):
+ headers = {}
+ # TODO auth can be handled here
+ if xml_type:
+ headers["Accept"] = "application/xml"
+
+ try:
+ req = request.Request(url, headers=headers)
+ response = request.urlopen(req)
+ response_str = response.read().decode('utf-8')
+ except error.HTTPError as e:
+ response_str = "HTTP error occurred: {} - {}".format(e.code, e.reason)
+ print("Request failed: ", response_str)
+ except Exception as e:
+ response_str = "Unexpected error: {}".format(e)
+ print("Request failed: {}".format(response_str))
+
+ return response_str
+
+
+def get_nodemanager_address(app_id):
+ app_info = create_request("http://{}/ws/v1/cluster/apps/{}".format(RM_ADDRESS, app_id))
+ app_info_xml = ET.fromstring(app_info)
+ return app_info_xml.find("amHostHttpAddress").text
+
+
+def get_node_log_address(node_address, link_regex):
+ try:
+ log_page = create_request("http://{}/logs/".format(node_address), False)
+ matches = re.findall(link_regex, log_page, re.MULTILINE)
+ if not matches:
+ return "Warning: No matching log links found at {}/logs/".format(node_address)
+ return node_address + matches[0]
+ except Exception as e:
+ return "Failed to retrieve node logs address from {}: {}".format(node_address, e)
+
+
+def filter_node_log(node_log_address: str, start_time: str, end_time: str):
+ return run_command("curl", "-s", "http://{}".format(node_log_address), "|", "sed", "-n",
+ "'/{}/,/{}/p'".format(start_time, end_time))
+
+
+def get_container_log(log_address, id):
+ return run_command("curl", "http://{}".format(log_address), "|", "grep", re.sub(r"^(job|application)", "container", id))
+
+
+def get_application_time(app_info_string):
+ app_element = ET.fromstring(app_info_string)
+ start_time_epoch = int(app_element.find("startedTime").text)
+ finish_time_epoch = int(app_element.find("finishedTime").text)
+
+ start_time_str = datetime.fromtimestamp(start_time_epoch / 1000).strftime(OUTPUT_TIME_FORMAT)[:-4] # -4, the time conversion is not accurrate
+ finish_time_str = datetime.fromtimestamp(finish_time_epoch / 1000).strftime(OUTPUT_TIME_FORMAT)[:-4]
+
+ return start_time_str, finish_time_str
+
+
+def get_resourcemanager_pid():
+ results = run_command("ps", "aux", "|", "grep", "resourcemanager", "|", "grep", "-v", "grep")
+
+ pids = []
+ for result in results.strip().splitlines():
+ pid = result.split()[1]
+ pids.append(pid)
+
+ return pids
+
+
+def get_multiple_jstack(pids):
+ all_jstacks = []
+
+ for pid in pids:
+ for i in range(NUMBER_OF_JSTACK): # Get multiple jstack
+ jstack_output = run_command("jstack", pid)
+ all_jstacks.append("--- JStack iteration-{} for PID: {} ---\n{}".format(i, pid, jstack_output))
+
+ return "\n".join(all_jstacks)
+
+
+def set_rm_scheduler_log_level(log_level):
+ return run_command("yarn", "daemonlog", "-setlevel", RM_ADDRESS,
+ "org.apache.hadoop.yarn.server.resourcemanager.scheduler", log_level)
+
+
+def format_datetime_no_seconds(datetime_obj):
+ return datetime_obj.strftime(OUTPUT_TIME_FORMAT_WITHOUT_SECOND)
+
+
+def main():
+
+ ISSUE_MAP = {
+ "application_diagnostic": application_diagnostic,
+ "scheduler_related_issue": scheduler_related_issue,
+ }
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-l", "--list", help="List the available issue types.", action="store_true")
+ parser.add_argument("-c", "--command", choices=list(ISSUE_MAP), help="Initiate the diagnostic information collecton"
+ "for diagnosing the selected issue type.")
+ parser.add_argument("-a", "--arguments", nargs='*', help="The required arguments for the selected issue type.")
+ global args
+ args = parser.parse_args()
+
+ if not (args.list or args.command):
+ parser.error('No action requested, use --list or --command')
+
+ if args.list:
+ list_issues()
+ sys.exit(os.EX_OK)
+
+ global RM_ADDRESS
+ RM_ADDRESS = parse_url_from_conf(YARN_SITE_XML, RM_ADDRESS_PROPERTY_NAME)
+ if RM_ADDRESS is None:
+ print("RM address can't be found, exiting...")
+ sys.exit(1)
+
+ selected_option = ISSUE_MAP[args.command]
+ print(selected_option()) # print the resulted output path that will be used by the DiagnosticsService.java
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java
index fd767953d16d0..bdd732d8964de 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/DefaultRequestInterceptorREST.java
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -49,6 +50,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
@@ -137,6 +139,22 @@ public ClusterMetricsInfo getClusterMetricsInfo() {
getConf(), client);
}
+ @Override
+ public CommonIssues getCommonIssueList() {
+ return RouterWebServiceUtil.genericForward(webAppAddress, null,
+ CommonIssues.class, HTTPMethods.GET,
+ RMWSConsts.RM_WEB_SERVICE_PATH + RMWSConsts.COMMON_ISSUE_LIST,
+ null, null, getConf(), client);
+ }
+
+ @Override
+ public Response getCommonIssueData(String issueId, List args) {
+ return RouterWebServiceUtil.genericForward(webAppAddress, null,
+ Response.class, HTTPMethods.GET,
+ RMWSConsts.RM_WEB_SERVICE_PATH + RMWSConsts.COMMON_ISSUE_COLLECT,
+ null, null, getConf(), client);
+ }
+
@Override
public SchedulerTypeInfo getSchedulerInfo() {
return RouterWebServiceUtil.genericForward(webAppAddress, null,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java
index ce7baa3df69ea..a5f381d623e92 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java
@@ -102,6 +102,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NewApplication;
@@ -199,6 +200,7 @@
import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.SIGNAL_TOCONTAINER;
import static org.apache.hadoop.yarn.server.router.webapp.RouterWebServiceUtil.extractToken;
import static org.apache.hadoop.yarn.server.router.webapp.RouterWebServiceUtil.getKerberosUserGroupInformation;
+import org.apache.commons.lang3.NotImplementedException;
/**
* Extends the {@code AbstractRESTRequestInterceptor} class and provides an
@@ -1141,6 +1143,16 @@ public AppState getAppState(HttpServletRequest hsr, String appId)
return new AppState();
}
+ @Override
+ public CommonIssues getCommonIssueList() {
+ throw new NotImplementedException("Code is not implemented");
+ }
+
+ @Override
+ public Response getCommonIssueData(String issueId, List args) {
+ throw new NotImplementedException("Code is not implemented");
+ }
+
@Override
public ClusterInfo get() {
return getClusterInfo();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java
index 1266598ff0e27..9323514429618 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServices.java
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.util.Collections;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -67,6 +68,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
@@ -300,6 +302,30 @@ public ClusterMetricsInfo getClusterMetricsInfo() {
return pipeline.getRootInterceptor().getClusterMetricsInfo();
}
+ @GET
+ @Path(RMWSConsts.COMMON_ISSUE_LIST)
+ @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
+ MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 })
+ @Override
+ public CommonIssues getCommonIssueList() {
+ init();
+ RequestInterceptorChainWrapper pipeline = getInterceptorChain(null);
+ return pipeline.getRootInterceptor().getCommonIssueList();
+ }
+
+ @GET
+ @Path(RMWSConsts.COMMON_ISSUE_COLLECT)
+ @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
+ MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 })
+ @Override
+ public Response getCommonIssueData(
+ @QueryParam(RMWSConsts.ISSUEID) String issueId,
+ @QueryParam(RMWSConsts.ISSUEARGS) List args) {
+ init();
+ RequestInterceptorChainWrapper pipeline = getInterceptorChain(null);
+ return pipeline.getRootInterceptor().getCommonIssueData(issueId, args);
+ }
+
@GET
@Path(RMWSConsts.SCHEDULER)
@Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java
index a09199b9e856b..bbab51b06bfbd 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockRESTRequestInterceptor.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.router.webapp;
import java.io.IOException;
+import java.util.List;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
@@ -44,6 +45,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
@@ -97,6 +99,16 @@ public ClusterMetricsInfo getClusterMetricsInfo() {
return new ClusterMetricsInfo();
}
+ @Override // TODO these may need to be edited for testing purposes
+ public CommonIssues getCommonIssueList() {
+ return new CommonIssues();
+ }
+
+ @Override // TODO these may need to be edited for testing purposes
+ public Response getCommonIssueData(String issueId, List args) {
+ return Response.ok().build();
+ }
+
@Override
public SchedulerTypeInfo getSchedulerInfo() {
return new SchedulerTypeInfo();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java
index 1bffd40db3c19..663f720a35d90 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/PassThroughRESTRequestInterceptor.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.router.webapp;
import java.io.IOException;
+import java.util.List;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
@@ -42,6 +43,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterUserInfo;
+import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CommonIssues;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.DelegationToken;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LabelsToNodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
@@ -122,6 +124,16 @@ public ClusterMetricsInfo getClusterMetricsInfo() {
return getNextInterceptor().getClusterMetricsInfo();
}
+ @Override
+ public CommonIssues getCommonIssueList() {
+ return getNextInterceptor().getCommonIssueList();
+ }
+
+ @Override
+ public Response getCommonIssueData(String issueId, List args) {
+ return getNextInterceptor().getCommonIssueData(issueId, args);
+ }
+
@Override
public SchedulerTypeInfo getSchedulerInfo() {
return getNextInterceptor().getSchedulerInfo();