diff --git a/html/js/i18n.js b/html/js/i18n.js
index d2fc39bd5..111478479 100644
--- a/html/js/i18n.js
+++ b/html/js/i18n.js
@@ -471,6 +471,7 @@ const i18n = {
disabled: 'Disabled',
disabledFailedSync: 'The detection was saved but synchronization failed. The detection has been disabled. Check SOC logs for details.',
disconnected: 'Disconnected from manager',
+ diskUsage: 'Disk Usage',
diskUsageElastic: 'Elastic Storage Used',
diskUsageInfluxDb: 'InfluxDB Storage Used',
diskUsageNsm: 'NSM Partition Usage',
@@ -924,6 +925,7 @@ const i18n = {
rulePassBadChars: 'The password must not contain the following characters: " \' $ & !',
rules: 'Rules',
ruleset: 'Ruleset',
+ runningTroubleshoot: 'Running troubleshoot script...',
save: 'Save',
saveSuccess: 'Save successful!',
searchUsername: 'Search User',
@@ -1147,6 +1149,7 @@ const i18n = {
trafficManOut: 'Outbound Mgmt Traffic',
trafficManOutAbbr: 'Mgmt Out',
transcriptCyberChefHelp: 'Send the transcript to CyberChef',
+ troubleshootElasticsearch: 'Troubleshoot Elasticsearch',
ttr: 'Time Tracking',
tuneDetection: 'Tune Detection',
tuneDetectionHelp: 'Tune the detection that triggered this alert',
diff --git a/html/js/routes/grid.js b/html/js/routes/grid.js
index 66a0582d1..97d20a5e7 100644
--- a/html/js/routes/grid.js
+++ b/html/js/routes/grid.js
@@ -58,6 +58,10 @@ routes.push({ path: '/grid', name: 'grid', component: {
gridMemberTestConfirmDialog: false,
gridMemberRestartConfirmDialog: false,
gridMemberUploadConfirmDialog: false,
+ troubleshootDialog: false,
+ troubleshootData: null,
+ troubleshootError: null,
+ troubleshootLoading: false,
uploadForm: { valid: true, attachment: null },
maxUploadSizeBytes: 25 * 1024 * 1024,
staleMetricsMs: 120000,
@@ -360,6 +364,46 @@ routes.push({ path: '/grid', name: 'grid', component: {
this.$root.stopLoading();
this.hideRestartConfirm();
},
+ canTroubleshoot(node) {
+ // Manager-type nodes only
+ return ['so-manager', 'so-managersearch', 'so-standalone'].indexOf(node.role) != -1;
+ },
+ showTroubleshootDialog(node) {
+ this.selectedNode = node;
+ this.troubleshootData = null;
+ this.troubleshootError = null;
+ this.troubleshootDialog = true;
+ this.runTroubleshoot();
+ },
+ hideTroubleshootDialog() {
+ this.troubleshootDialog = false;
+ this.troubleshootData = null;
+ this.troubleshootError = null;
+ this.selectedNode = null;
+ },
+ async runTroubleshoot() {
+ const nodeId = this.getNodeName(this.selectedNode);
+ this.troubleshootLoading = true;
+ this.troubleshootError = null;
+ try {
+ const response = await this.$root.papi.post('gridmembers/' + nodeId + "/estroubleshoot", null, {
+ params: {gridId: this.selectedNode.gridId}
+ });
+ try {
+ this.troubleshootData = JSON.parse(response.data.output);
+ } catch (parseError) {
+ this.troubleshootError = 'Failed to parse troubleshoot output';
+ }
+ } catch (error) {
+ this.troubleshootError = error.message || error;
+ }
+ this.troubleshootLoading = false;
+ },
+ getStatusColor(status) {
+ if (status === 'green' || status === 'ok') return 'success';
+ if (status === 'yellow' || status === 'high') return 'warning';
+ return 'error';
+ },
hasContainer(item, container) {
return item && item.containers && item.containers.find(function(x) {
return x.Name == container;
diff --git a/html/pages/grid.html b/html/pages/grid.html
index 631266c4f..9a20d317b 100644
--- a/html/pages/grid.html
+++ b/html/pages/grid.html
@@ -213,6 +213,7 @@
{{ i18n.gridEps }} {{ formatCo
{{ i18n.eventstoreStatus }}:
{{ $root.localizeMessage(item.eventstoreStatus) }}
+ fa-circle-info
@@ -529,4 +530,114 @@
{{ i18n.gridEps }} {{ formatCo
+
+
+
+ {{ i18n.troubleshootElasticsearch }}
+
+
+ fa-times
+
+
+
+
+
+
{{ i18n.runningTroubleshoot }}
+
+ {{ troubleshootError }}
+
+
+
+ {{ troubleshootData.elasticsearchStatus?.errorMessage || 'Elasticsearch is not accessible' }}
+
+
+
+
+
Issues Found
+
+ {{ indicator?.symptom }}
+
+
+ {{ diag.cause }}
+
+ Affected node{{ diag.affectedNodes.length > 1 ? 's' : '' }}: {{ diag.affectedNodes.length <= 5 ? diag.affectedNodes.join(', ') : diag.affectedNodes.slice(0, 5).join(', ') + ' and ' + (diag.affectedNodes.length - 5) + ' more' }}
+
+ Recommendation: {{ diag.action }}
+
+
+
+ {{ diag.affectedIndices.length }} affected indices
+
+
+
+
{{ index }}
+
...and {{ diag.affectedIndices.length - 50 }} more
+
+
+
+
+
+
+
+
+
+
+
{{ i18n.diskUsage }}
+
+
+
+
+
+ | Node |
+ IP |
+ Disk Used |
+ Available |
+ Status |
+
+
+
+
+
+ | {{ node.name }} |
+ {{ node.ip }} |
+ {{ node.usedPercent != null ? node.usedPercent.toFixed(1) + '%' : '—' }} |
+ {{ node.availableBytes }} |
+ {{ node.status }} |
+
+
+
+
+
+
+
+ Elasticsearch watermarks —
+ writes blocked at {{ troubleshootData.diskUsage?.watermarks?.floodDisplay }} (flood),
+ rebalancing starts at {{ troubleshootData.diskUsage?.watermarks?.highDisplay }} (high),
+ new shards blocked at {{ troubleshootData.diskUsage?.watermarks?.lowDisplay }} (low)
+
+
+
+
+
All Checks
+
+
+ {{ indicator?.displayName }}
+
+
+
+
+
+
+
+
+ {{ i18n.refresh }}
+ {{ i18n.close }}
+
+
+
diff --git a/server/gridmembershandler.go b/server/gridmembershandler.go
index 44928ee14..f40570ba4 100644
--- a/server/gridmembershandler.go
+++ b/server/gridmembershandler.go
@@ -275,7 +275,7 @@ func (h *GridMembersHandler) postImport(w http.ResponseWriter, r *http.Request)
// @Tags Grid
// @Security bearer[grid/write]
// @Param id path string true "The grid member ID to be managed" example(so_standalone)
-// @Param operation path string true "The operation to perform: add, reject, delete, test, restart" example(reject)
+// @Param operation path string true "The operation to perform: add, reject, delete, test, restart, estroubleshoot" example(reject)
// @Success 200 "The operation was executed successfully"
// @Failure 401 "Request was not properly authenticated"
// @Failure 403 "Insufficient permissions for this request"
@@ -292,11 +292,26 @@ func (h *GridMembersHandler) postManageMembers(w http.ResponseWriter, r *http.Re
}
op := chi.URLParam(r, "operation")
- if op != "add" && op != "reject" && op != "delete" && op != "test" && op != "restart" {
+ if op != "add" && op != "reject" && op != "delete" && op != "test" && op != "restart" && op != "estroubleshoot" {
web.Respond(w, r, http.StatusBadRequest, errors.New("Invalid operation"))
return
}
+ // Handle Elasticsearch troubleshoot operation separately since it returns output
+ if op == "estroubleshoot" {
+ // Extract node name from minion ID (e.g., "manager_standalone" -> "manager")
+ parts := strings.SplitN(id, "_", 2)
+ nodeName := parts[0]
+
+ output, err := h.server.GridMembersstore.RunTroubleshoot(ctx, nodeName, "/usr/sbin/so-elasticsearch-troubleshoot")
+ if err != nil {
+ web.Respond(w, r, http.StatusInternalServerError, err)
+ return
+ }
+ web.Respond(w, r, http.StatusOK, map[string]string{"output": output})
+ return
+ }
+
err := h.server.GridMembersstore.ManageMember(ctx, op, id)
if err != nil {
web.Respond(w, r, http.StatusBadRequest, err)
diff --git a/server/gridmembersstore.go b/server/gridmembersstore.go
index 5e7136130..1ea4b3c73 100644
--- a/server/gridmembersstore.go
+++ b/server/gridmembersstore.go
@@ -17,6 +17,7 @@ type GridMembersstore interface {
ManageMember(ctx context.Context, operation string, id string) error
SendFile(ctx context.Context, node string, from string, to string, cleanup bool) error
Import(ctx context.Context, node string, file string, importer string) (*string, error)
+ RunTroubleshoot(ctx context.Context, node string, script string) (string, error)
}
//go:generate mockgen -destination mock/mock_gridmembersstore.go -package mock . GridMembersstore
diff --git a/server/mock/mock_gridmembersstore.go b/server/mock/mock_gridmembersstore.go
index 115a8e1cd..46c5185f9 100644
--- a/server/mock/mock_gridmembersstore.go
+++ b/server/mock/mock_gridmembersstore.go
@@ -85,6 +85,21 @@ func (mr *MockGridMembersstoreMockRecorder) ManageMember(ctx, operation, id any)
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ManageMember", reflect.TypeOf((*MockGridMembersstore)(nil).ManageMember), ctx, operation, id)
}
+// RunTroubleshoot mocks base method.
+func (m *MockGridMembersstore) RunTroubleshoot(ctx context.Context, node, script string) (string, error) {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "RunTroubleshoot", ctx, node, script)
+ ret0, _ := ret[0].(string)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
+}
+
+// RunTroubleshoot indicates an expected call of RunTroubleshoot.
+func (mr *MockGridMembersstoreMockRecorder) RunTroubleshoot(ctx, node, script any) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RunTroubleshoot", reflect.TypeOf((*MockGridMembersstore)(nil).RunTroubleshoot), ctx, node, script)
+}
+
// SendFile mocks base method.
func (m *MockGridMembersstore) SendFile(ctx context.Context, node, from, to string, cleanup bool) error {
m.ctrl.T.Helper()
diff --git a/server/modules/salt/saltstore.go b/server/modules/salt/saltstore.go
index 662dc114a..e303717ad 100644
--- a/server/modules/salt/saltstore.go
+++ b/server/modules/salt/saltstore.go
@@ -1164,6 +1164,28 @@ func (store *Saltstore) Import(ctx context.Context, node string, file string, im
return &output, err
}
+// RunTroubleshoot submits a "run-troubleshoot" command for the given node, asking for
+// the given script to be run with the " --json" flag appended, and returns the command
+// output. The caller must be authorized for "read" on "grid". On any failure the
+// returned string is empty and the error is passed through unchanged.
+func (store *Saltstore) RunTroubleshoot(ctx context.Context, node string, script string) (string, error) {
+	authErr := store.server.CheckAuthorized(ctx, "read", "grid")
+	if authErr != nil {
+		return "", authErr
+	}
+
+	request := make(map[string]string, 3)
+	request["command"] = "run-troubleshoot"
+	request["node"] = node
+	request["script"] = script + " --json"
+
+	// Troubleshoot scripts may take up to 120 seconds, so the long relay timeout
+	// is applied to the context used for this command.
+	relayCtx := options.WithTimeoutMs(ctx, store.longRelayTimeoutMs)
+
+	result, execErr := store.execCommand(relayCtx, request)
+	if execErr != nil {
+		// Deliberately drop any partial output when the command fails.
+		return "", execErr
+	}
+
+	return result, nil
+}
+
+
func (store *Saltstore) lookupEmailFromId(ctx context.Context, id string) string {
user, _ := store.server.Userstore.GetUserById(ctx, id)
if user != nil && user.Id == id {