-
Notifications
You must be signed in to change notification settings - Fork 24
WIP - ES Troubleshooting #1020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 2.4/dev
Are you sure you want to change the base?
WIP - ES Troubleshooting #1020
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,6 +58,10 @@ routes.push({ path: '/grid', name: 'grid', component: { | |
| gridMemberTestConfirmDialog: false, | ||
| gridMemberRestartConfirmDialog: false, | ||
| gridMemberUploadConfirmDialog: false, | ||
| troubleshootDialog: false, | ||
| troubleshootData: null, | ||
| troubleshootError: null, | ||
| troubleshootLoading: false, | ||
| uploadForm: { valid: true, attachment: null }, | ||
| maxUploadSizeBytes: 25 * 1024 * 1024, | ||
| staleMetricsMs: 120000, | ||
|
|
@@ -360,6 +364,46 @@ routes.push({ path: '/grid', name: 'grid', component: { | |
| this.$root.stopLoading(); | ||
| this.hideRestartConfirm(); | ||
| }, | ||
| canTroubleshoot(node) { | ||
| // Manager-type nodes only | ||
| return ['so-manager', 'so-managersearch', 'so-standalone'].indexOf(node.role) != -1; | ||
| }, | ||
| showTroubleshootDialog(node) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These new JS functions need unit tests. |
||
| this.selectedNode = node; | ||
| this.troubleshootData = null; | ||
| this.troubleshootError = null; | ||
| this.troubleshootDialog = true; | ||
| this.runTroubleshoot(); | ||
| }, | ||
| hideTroubleshootDialog() { | ||
| this.troubleshootDialog = false; | ||
| this.troubleshootData = null; | ||
| this.troubleshootError = null; | ||
| this.selectedNode = null; | ||
| }, | ||
| async runTroubleshoot() { | ||
| const nodeId = this.getNodeName(this.selectedNode); | ||
| this.troubleshootLoading = true; | ||
| this.troubleshootError = null; | ||
| try { | ||
| const response = await this.$root.papi.post('gridmembers/' + nodeId + "/estroubleshoot", null, { | ||
| params: {gridId: this.selectedNode.gridId} | ||
| }); | ||
| try { | ||
| this.troubleshootData = JSON.parse(response.data.output); | ||
| } catch (parseError) { | ||
| this.troubleshootError = 'Failed to parse troubleshoot output'; | ||
| } | ||
| } catch (error) { | ||
| this.troubleshootError = error.message || error; | ||
| } | ||
| this.troubleshootLoading = false; | ||
| }, | ||
| getStatusColor(status) { | ||
| if (status === 'green' || status === 'ok') return 'success'; | ||
| if (status === 'yellow' || status === 'high') return 'warning'; | ||
| return 'error'; | ||
| }, | ||
| hasContainer(item, container) { | ||
| return item && item.containers && item.containers.find(function(x) { | ||
| return x.Name == container; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,6 +213,7 @@ <h4 v-if="metricsEnabled" data-aid="grid_summary">{{ i18n.gridEps }} {{ formatCo | |
| <span class="filter label">{{ i18n.eventstoreStatus }}:</span> | ||
| <span id="node_eventstoreStatus" :class="areMetricsCurrent(item) ? 'filter value text-' + colorNodeStatus(item.eventstoreStatus) : 'filter value stale'"> | ||
| {{ $root.localizeMessage(item.eventstoreStatus) }} | ||
| <v-icon v-if="canTroubleshoot(item)" style="font-size: 10px; vertical-align: middle;" :title="i18n.troubleshootElasticsearch" @click="showTroubleshootDialog(item)" class="theme-icon cursor-pointer ml-1" data-aid="grid_node_troubleshoot">fa-circle-info</v-icon> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I try to avoid inline styles. Was the icon too big by default? If so, the Vuetify v-icon component should have a size property. |
||
| </span> | ||
| </div> | ||
| <div v-if="item.metricsEnabled" class="text-no-wrap"> | ||
|
|
@@ -529,4 +530,114 @@ <h4 v-if="metricsEnabled" data-aid="grid_summary">{{ i18n.gridEps }} {{ formatCo | |
| </v-card-actions> | ||
| </v-card> | ||
| </v-dialog> | ||
| <v-dialog v-model="troubleshootDialog" persistent max-width="900" scrollable data-aid="grid_troubleshoot_dialog"> | ||
| <v-card> | ||
| <v-card-title class="text-h5 d-flex align-center"> | ||
| {{ i18n.troubleshootElasticsearch }} | ||
| <v-spacer></v-spacer> | ||
| <v-btn icon variant="text" @click="hideTroubleshootDialog" data-aid="grid_troubleshoot_close"> | ||
| <v-icon>fa-times</v-icon> | ||
| </v-btn> | ||
| </v-card-title> | ||
| <v-card-text> | ||
| <div v-if="troubleshootLoading" class="text-center py-4"> | ||
| <v-progress-circular indeterminate color="primary"></v-progress-circular> | ||
| <p class="mt-2">{{ i18n.runningTroubleshoot }}</p> | ||
| </div> | ||
| <v-alert v-else-if="troubleshootError" type="error" class="mb-4">{{ troubleshootError }}</v-alert> | ||
| <div v-else-if="troubleshootData"> | ||
| <!-- Elasticsearch Not Accessible --> | ||
| <v-alert v-if="!troubleshootData.elasticsearchStatus?.accessible" type="error" class="mb-4"> | ||
| {{ troubleshootData.elasticsearchStatus?.errorMessage || 'Elasticsearch is not accessible' }} | ||
| </v-alert> | ||
| <div v-else> | ||
| <!-- Issues Section - Show problems first --> | ||
| <div v-if="troubleshootData.elasticsearchStatus?.healthReport?.nonGreenCount > 0 && troubleshootData.elasticsearchStatus?.healthReport?.indicators?.length"> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These inline scripts are getting complex. It would be cleaner and more testable to move them to grid.js functions. Ex: |
||
| <div class="text-subtitle-1 font-weight-bold mb-3">Issues Found</div> | ||
| <v-alert v-for="(indicator, issueIdx) in (troubleshootData?.elasticsearchStatus?.healthReport?.indicators || []).filter(i => i && i.status !== 'green' && !['repository_integrity', 'slm'].includes(i.name))" | ||
| :key="issueIdx" | ||
| :type="indicator?.status === 'red' ? 'error' : 'warning'" | ||
| variant="tonal" | ||
| class="mb-3"> | ||
| <div class="mb-2">{{ indicator?.symptom }}</div> | ||
| <div v-for="(diag, diagIdx) in (indicator?.diagnosis || [])" :key="diagIdx"> | ||
| <template v-if="diag"> | ||
| <p class="mb-2 text-caption">{{ diag.cause }}</p> | ||
| <p v-if="diag.affectedNodes?.length > 0" class="mb-2 text-caption"> | ||
| Affected node{{ diag.affectedNodes.length > 1 ? 's' : '' }}: {{ diag.affectedNodes.length <= 5 ? diag.affectedNodes.join(', ') : diag.affectedNodes.slice(0, 5).join(', ') + ' and ' + (diag.affectedNodes.length - 5) + ' more' }} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks like it could use some help.
Several other English phrases in this PR also need to be localized. |
||
| </p> | ||
| <p class="mb-2 text-caption">Recommendation: {{ diag.action }}</p> | ||
| <v-expansion-panels v-if="diag.affectedIndices?.length > 0" flat> | ||
| <v-expansion-panel> | ||
| <v-expansion-panel-title class="py-0 text-caption"> | ||
| {{ diag.affectedIndices.length }} affected indices | ||
| </v-expansion-panel-title> | ||
| <v-expansion-panel-text> | ||
| <div style="max-height: 150px; overflow-y: auto;" class="text-caption"> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is another example where you should try to stick to Vuetify style classes instead of inlining styles. Matthew can assist here if getting ramped up on this is going to be too time-consuming. |
||
| <div v-for="(index, indexIdx) in (diag.affectedIndices || []).slice(0, 50)" :key="indexIdx">{{ index }}</div> | ||
| <div v-if="diag.affectedIndices?.length > 50" class="font-italic">...and {{ diag.affectedIndices.length - 50 }} more</div> | ||
| </div> | ||
| </v-expansion-panel-text> | ||
| </v-expansion-panel> | ||
| </v-expansion-panels> | ||
| </template> | ||
| </div> | ||
| </v-alert> | ||
| </div> | ||
|
|
||
| <!-- Disk Usage Table --> | ||
| <div class="text-subtitle-1 font-weight-bold mb-2">{{ i18n.diskUsage }}</div> | ||
| <div v-if="troubleshootData.diskUsage?.nodes?.length" style="max-height: 300px; overflow-y: auto;"> | ||
| <v-table density="compact"> | ||
| <template #default> | ||
| <thead> | ||
| <tr> | ||
| <th>Node</th> | ||
| <th>IP</th> | ||
| <th>Disk Used</th> | ||
| <th>Available</th> | ||
| <th>Status</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <template v-for="(node, nodeIdx) in (troubleshootData?.diskUsage?.nodes || [])" :key="nodeIdx"> | ||
| <tr v-if="node"> | ||
| <td>{{ node.name }}</td> | ||
| <td>{{ node.ip }}</td> | ||
| <td>{{ node.usedPercent != null ? node.usedPercent.toFixed(1) + '%' : '—' }}</td> | ||
| <td>{{ node.availableBytes }}</td> | ||
| <td><v-chip :color="getStatusColor(node.status)" size="x-small">{{ node.status }}</v-chip></td> | ||
| </tr> | ||
| </template> | ||
| </tbody> | ||
| </template> | ||
| </v-table> | ||
| </div> | ||
| <div class="text-caption text--secondary mb-4"> | ||
| Elasticsearch watermarks — | ||
| writes blocked at <strong>{{ troubleshootData.diskUsage?.watermarks?.floodDisplay }}</strong> (flood), | ||
| rebalancing starts at <strong>{{ troubleshootData.diskUsage?.watermarks?.highDisplay }}</strong> (high), | ||
| new shards blocked at <strong>{{ troubleshootData.diskUsage?.watermarks?.lowDisplay }}</strong> (low) | ||
| </div> | ||
|
|
||
| <!-- All Checks Summary --> | ||
| <div v-if="troubleshootData.elasticsearchStatus?.healthReport?.indicators?.length" class="mt-4"> | ||
| <div class="text-subtitle-1 font-weight-bold mb-2">All Checks</div> | ||
| <div class="d-flex flex-wrap ga-2"> | ||
| <v-chip v-for="(indicator, indicatorIdx) in (troubleshootData?.elasticsearchStatus?.healthReport?.indicators || []).filter(i => i && !['repository_integrity', 'slm'].includes(i.name))" :key="indicatorIdx" | ||
| :color="getStatusColor(indicator?.status)" variant="flat" size="small"> | ||
| {{ indicator?.displayName }} | ||
| </v-chip> | ||
| </div> | ||
| </div> | ||
| </div> | ||
| </div> | ||
| </v-card-text> | ||
| <v-card-actions> | ||
| <v-spacer></v-spacer> | ||
| <v-btn text @click="runTroubleshoot" :disabled="troubleshootLoading" data-aid="grid_troubleshoot_refresh">{{ i18n.refresh }}</v-btn> | ||
| <v-btn text @click="hideTroubleshootDialog" data-aid="grid_troubleshoot_close_btn">{{ i18n.close }}</v-btn> | ||
| </v-card-actions> | ||
| </v-card> | ||
| </v-dialog> | ||
| </v-container> | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -275,7 +275,7 @@ func (h *GridMembersHandler) postImport(w http.ResponseWriter, r *http.Request) | |
| // @Tags Grid | ||
| // @Security bearer[grid/write] | ||
| // @Param id path string true "The grid member ID to be managed" example(so_standalone) | ||
| // @Param operation path string true "The operation to perform: add, reject, delete, test, restart" example(reject) | ||
| // @Param operation path string true "The operation to perform: add, reject, delete, test, restart, estroubleshoot" example(reject) | ||
| // @Success 200 "The operation was executed successfully" | ||
| // @Failure 401 "Request was not properly authenticated" | ||
| // @Failure 403 "Insufficient permissions for this request" | ||
|
|
@@ -292,11 +292,26 @@ func (h *GridMembersHandler) postManageMembers(w http.ResponseWriter, r *http.Re | |
| } | ||
|
|
||
| op := chi.URLParam(r, "operation") | ||
| if op != "add" && op != "reject" && op != "delete" && op != "test" && op != "restart" { | ||
| if op != "add" && op != "reject" && op != "delete" && op != "test" && op != "restart" && op != "estroubleshoot" { | ||
| web.Respond(w, r, http.StatusBadRequest, errors.New("Invalid operation")) | ||
| return | ||
| } | ||
|
|
||
| // Handle Elasticsearch troubleshoot operation separately since it returns output | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Currently you have a mixture of reusing the manage member handler and then branching out to a new gridMemberStore function. There are two ways to handle this:
|
||
| if op == "estroubleshoot" { | ||
| // Extract node name from minion ID (e.g., "manager_standalone" -> "manager") | ||
| parts := strings.SplitN(id, "_", 2) | ||
| nodeName := parts[0] | ||
|
|
||
| output, err := h.server.GridMembersstore.RunTroubleshoot(ctx, nodeName, "/usr/sbin/so-elasticsearch-troubleshoot") | ||
| if err != nil { | ||
| web.Respond(w, r, http.StatusInternalServerError, err) | ||
| return | ||
| } | ||
| web.Respond(w, r, http.StatusOK, map[string]string{"output": output}) | ||
| return | ||
| } | ||
|
|
||
| err := h.server.GridMembersstore.ManageMember(ctx, op, id) | ||
| if err != nil { | ||
| web.Respond(w, r, http.StatusBadRequest, err) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ type GridMembersstore interface { | |
| ManageMember(ctx context.Context, operation string, id string) error | ||
| SendFile(ctx context.Context, node string, from string, to string, cleanup bool) error | ||
| Import(ctx context.Context, node string, file string, importer string) (*string, error) | ||
| RunTroubleshoot(ctx context.Context, node string, script string) (string, error) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would go away in favor of us either reusing the existing |
||
| } | ||
|
|
||
| //go:generate mockgen -destination mock/mock_gridmembersstore.go -package mock . GridMembersstore | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1164,6 +1164,28 @@ func (store *Saltstore) Import(ctx context.Context, node string, file string, im | |
| return &output, err | ||
| } | ||
|
|
||
| func (store *Saltstore) RunTroubleshoot(ctx context.Context, node string, script string) (string, error) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider renaming this to Diagnose() and passing in an enum like [...]. But first read the other comments below. |
||
| if err := store.server.CheckAuthorized(ctx, "read", "grid"); err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| args := map[string]string{ | ||
| "command": "run-troubleshoot", | ||
| "node": node, | ||
| "script": script + " --json", | ||
| } | ||
|
|
||
| // Use long timeout since troubleshoot scripts may take up to 120 seconds | ||
| ctxTimeout := options.WithTimeoutMs(ctx, store.longRelayTimeoutMs) | ||
|
|
||
| output, err := store.execCommand(ctxTimeout, args) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| return output, nil | ||
| } | ||
|
|
||
| func (store *Saltstore) lookupEmailFromId(ctx context.Context, id string) string { | ||
| user, _ := store.server.Userstore.GetUserById(ctx, id) | ||
| if user != nil && user.Id == id { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would think there would be more node types here, like eval, import, etc.