diff --git a/vclusterops/https_poll_node_state_op.go b/vclusterops/https_poll_node_state_op.go index 4a89966..a8dcb7c 100644 --- a/vclusterops/https_poll_node_state_op.go +++ b/vclusterops/https_poll_node_state_op.go @@ -100,7 +100,8 @@ func makeHTTPSPollNodeStateOp(logger vlog.Printer, hosts []string, } func (op *httpsPollNodeStateOp) getPollingTimeout() int { - return op.timeout + // a negative value indicates no timeout and should never be used for this op + return util.Max(op.timeout, 0) } func (op *httpsPollNodeStateOp) setupClusterHTTPRequest(hosts []string) error { diff --git a/vclusterops/https_poll_subscription_state_op.go b/vclusterops/https_poll_subscription_state_op.go index 16bcb64..c040475 100644 --- a/vclusterops/https_poll_subscription_state_op.go +++ b/vclusterops/https_poll_subscription_state_op.go @@ -48,7 +48,8 @@ func makeHTTPSPollSubscriptionStateOp(logger vlog.Printer, hosts []string, } func (op *httpsPollSubscriptionStateOp) getPollingTimeout() int { - return op.timeout + // a negative value indicates no timeout and should never be used for this op + return util.Max(op.timeout, 0) } func (op *httpsPollSubscriptionStateOp) setupClusterHTTPRequest(hosts []string) error { diff --git a/vclusterops/https_re_ip_op.go b/vclusterops/https_re_ip_op.go index b74b8da..62fb3eb 100644 --- a/vclusterops/https_re_ip_op.go +++ b/vclusterops/https_re_ip_op.go @@ -26,7 +26,7 @@ type httpsReIPOp struct { opBase opHTTPSBase hostToReIP []string - reIPList map[string]reIPInfo + reIPList map[string]ReIPInfo nodeNamesList []string upHosts []string } @@ -77,7 +77,7 @@ func (op *httpsReIPOp) setupClusterHTTPRequest(hosts []string) error { } func (op *httpsReIPOp) prepare(execContext *opEngineExecContext) error { - op.reIPList = make(map[string]reIPInfo) + op.reIPList = make(map[string]ReIPInfo) // update reIPList from input node names and execContext.networkProfiles for i := 0; i < len(op.nodeNamesList); i++ { nodeNameToReIP := op.nodeNamesList[i] @@ -86,7 +86,7 @@ func (op *httpsReIPOp) prepare(execContext *opEngineExecContext) error { if !ok { return fmt.Errorf("[%s] unable to find network profile for address %s", op.name, targetAddress) } - info := reIPInfo{ + info := ReIPInfo{ NodeName: nodeNameToReIP, TargetAddress: targetAddress, TargetControlAddress: profile.Address, diff --git a/vclusterops/nma_check_system_tables_op.go b/vclusterops/nma_check_system_tables_op.go new file mode 100644 index 0000000..232799b --- /dev/null +++ b/vclusterops/nma_check_system_tables_op.go @@ -0,0 +1,124 @@ +/* + (c) Copyright [2023] Open Text. + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package vclusterops + +import ( + "errors" + "fmt" + + "github.com/vertica/vcluster/vclusterops/vlog" +) + +type nmaCheckSystemTablesOp struct { + scrutinizeOpBase +} + +type checkSystemTablesResponseData struct { + Name string `json:"name"` +} + +func makeNMACheckSystemTablesOp(logger vlog.Printer, + id string, + hostNodeNameMap map[string]string) nmaCheckSystemTablesOp { + // base members + op := nmaCheckSystemTablesOp{} + op.name = "NMACheckSystemTablesOp" + op.logger = logger.WithName(op.name) + + // scrutinize members + op.id = id + op.batch = scrutinizeBatchSystemTables + op.hostNodeNameMap = hostNodeNameMap + op.httpMethod = GetMethod + op.urlSuffix = "/vs-status" + + return op +} + +func (op *nmaCheckSystemTablesOp) getPollingTimeout() int { + return ScrutinizePollingTimeout +} + +func (op *nmaCheckSystemTablesOp) prepare(execContext *opEngineExecContext) error { + // only run this task on a single up node + // if there are no up nodes, skip this task + if len(execContext.upHosts) == 0 { + op.logger.PrintWarning("no up hosts to collect system tables from, skipping the operation") + op.skipExecute = true + return nil + } + host := getInitiator(execContext.upHosts) + + // construct host list for interface purposes + op.hosts = []string{host} + err := validateHostMaps(op.hosts, op.hostNodeNameMap) + if err != nil { + return err + } + + // prepare GET request with no params or body + execContext.dispatcher.setup(op.hosts) + return op.setupClusterHTTPRequest(op.hosts) +} + +func (op *nmaCheckSystemTablesOp) execute(execContext *opEngineExecContext) error { + if err := op.runExecute(execContext); err != nil { + return err + } + + return op.processResult(execContext) +} + +func (op *nmaCheckSystemTablesOp) finalize(_ *opEngineExecContext) error { + return nil +} + +func (op *nmaCheckSystemTablesOp) processResult(execContext *opEngineExecContext) error { + // wait until initiator has finished gathering system tables + err := pollState(op, execContext) + if err != nil { + return fmt.Errorf("failed to stage system tables, %w", err) + } + + // parse and log responses + op.logger.Info("system tables finished staging") + tableList := make([]checkSystemTablesResponseData, 0) + return processStagedItemsResult(&op.scrutinizeOpBase, tableList) +} + +func (op *nmaCheckSystemTablesOp) shouldStopPolling() (bool, error) { + var allErrs error + isProcessing := false + + // in practice, only one host + for host, result := range op.clusterHTTPRequest.ResultCollection { + op.logResponse(host, result) + if result.isPassing() { + // passing but not success -> 202 accepted + if !result.isSuccess() { + isProcessing = true + } + } else { + allErrs = errors.Join(allErrs, result.err) + } + } + + // any errors -> finished + // 200 -> finished (isProcessing == false) + // 202 -> continue (isProcessing == true) + stop := (allErrs != nil) || !isProcessing + return stop, allErrs +} diff --git a/vclusterops/nma_download_file_op.go b/vclusterops/nma_download_file_op.go index 9acc51f..56abe5c 100644 --- a/vclusterops/nma_download_file_op.go +++ b/vclusterops/nma_download_file_op.go @@ -57,27 +57,27 @@ type downloadFileRequestData struct { // ClusterLeaseNotExpiredFailure is returned when an attempt is made to use a // communal storage before the lease for it has expired. 
-type clusterLeaseNotExpiredError struct { +type ClusterLeaseNotExpiredError struct { Expiration string } -func (e *clusterLeaseNotExpiredError) Error() string { +func (e *ClusterLeaseNotExpiredError) Error() string { return fmt.Sprintf("revive database cannot continue because the communal storage location might still be in use."+ " The cluster lease will expire at %s(UTC)."+ " Please ensure that the other cluster has stopped and try revive_db after the cluster lease expiration", e.Expiration) } -// reviveDBNodeCountMismatchError is returned when the number of nodes in new cluster +// ReviveDBNodeCountMismatchError is returned when the number of nodes in new cluster // does not match the number of nodes in original cluster -type reviveDBNodeCountMismatchError struct { +type ReviveDBNodeCountMismatchError struct { ReviveDBStep string FailureHost string NumOfNewNodes int NumOfOldNodes int } -func (e *reviveDBNodeCountMismatchError) Error() string { +func (e *ReviveDBNodeCountMismatchError) Error() string { return fmt.Sprintf(`[%s] nodes mismatch found on host %s: the number of the new nodes in --hosts is %d,`+ ` but the number of the old nodes in description file is %d`, e.ReviveDBStep, e.FailureHost, e.NumOfNewNodes, e.NumOfOldNodes) @@ -221,7 +221,7 @@ func (op *nmaDownloadFileOp) processResult(execContext *opEngineExecContext) err } if len(descFileContent.NodeList) != len(op.newNodes) { - err := &reviveDBNodeCountMismatchError{ + err := &ReviveDBNodeCountMismatchError{ ReviveDBStep: op.name, FailureHost: host, NumOfNewNodes: len(op.newNodes), @@ -302,7 +302,7 @@ func (op *nmaDownloadFileOp) clusterLeaseCheck(clusterLeaseExpiration string) er // current time < expire time, it means that the cluster lease is not expired if utcNow.Before(utcExpiration) { - return &clusterLeaseNotExpiredError{Expiration: clusterLeaseExpiration} + return &ClusterLeaseNotExpiredError{Expiration: clusterLeaseExpiration} } op.logger.PrintInfo("Cluster lease check has passed. We proceed to revive the database") diff --git a/vclusterops/nma_download_file_op_test.go b/vclusterops/nma_download_file_op_test.go index 7508ab1..fd0405b 100644 --- a/vclusterops/nma_download_file_op_test.go +++ b/vclusterops/nma_download_file_op_test.go @@ -33,7 +33,7 @@ func TestClusteLeaseExpiryError(t *testing.T) { err := op.clusterLeaseCheck(fakeLeaseTime.Format(expirationStringLayout)) assert.Error(t, err) // Ensure we get a specific error type back - clusterLeaseErr := &clusterLeaseNotExpiredError{} + clusterLeaseErr := &ClusterLeaseNotExpiredError{} ok := errors.As(err, &clusterLeaseErr) assert.True(t, ok) assert.Contains(t, err.Error(), "The cluster lease will expire at") diff --git a/vclusterops/nma_get_scrutinize_tar_op.go b/vclusterops/nma_get_scrutinize_tar_op.go index 1767cc3..31d4b86 100644 --- a/vclusterops/nma_get_scrutinize_tar_op.go +++ b/vclusterops/nma_get_scrutinize_tar_op.go @@ -26,6 +26,7 @@ import ( type nmaGetScrutinizeTarOp struct { scrutinizeOpBase + useInitiator bool } func makeNMAGetScrutinizeTarOp(logger vlog.Printer, @@ -54,6 +55,12 @@ func makeNMAGetScrutinizeTarOp(logger vlog.Printer, return op, err } +// useSingleHost indicates that the tarball should only be retrieved from the first +// up node +func (op *nmaGetScrutinizeTarOp) useSingleHost() { + op.useInitiator = true +} + // createOutputDir creates a subdirectory {id} under /tmp/scrutinize/remote, which // may also be created by this function. 
the "remote" subdirectory is created to // separate local scrutinize data staged by the NMA (placed in /tmp/scrutinize/) from @@ -75,6 +82,24 @@ func (op *nmaGetScrutinizeTarOp) createOutputDir() error { } func (op *nmaGetScrutinizeTarOp) prepare(execContext *opEngineExecContext) error { + // for the system table batch + if op.useInitiator { + if len(execContext.upHosts) == 0 { + op.logger.PrintWarning("no up hosts to collect system tables from, skipping the operation") + op.skipExecute = true + return nil + } + host := getInitiator(execContext.upHosts) + op.hosts = []string{host} + + // the initiator host should have been in the original host list, and already + // validated, but let's not assume + err := validateHostMaps(op.hosts, op.hostNodeNameMap) + if err != nil { + return err + } + } + hostToFilePathsMap := map[string]string{} for _, host := range op.hosts { hostToFilePathsMap[host] = fmt.Sprintf("%s/%s/%s-%s.tgz", diff --git a/vclusterops/nma_re_ip_op.go b/vclusterops/nma_re_ip_op.go index 2d9ffa4..b58a12e 100644 --- a/vclusterops/nma_re_ip_op.go +++ b/vclusterops/nma_re_ip_op.go @@ -25,7 +25,7 @@ import ( type nmaReIPOp struct { opBase - reIPList []reIPInfo + reIPList []ReIPInfo vdb *VCoordinationDatabase primaryNodeCount uint hostRequestBodyMap map[string]string @@ -35,7 +35,7 @@ type nmaReIPOp struct { } func makeNMAReIPOp(logger vlog.Printer, - reIPList []reIPInfo, + reIPList []ReIPInfo, vdb *VCoordinationDatabase, trimReIPData bool) nmaReIPOp { op := nmaReIPOp{} @@ -47,7 +47,7 @@ func makeNMAReIPOp(logger vlog.Printer, return op } -type reIPInfo struct { +type ReIPInfo struct { NodeName string `json:"node_name"` NodeAddress string `json:"-"` TargetAddress string `json:"address"` @@ -57,7 +57,7 @@ type reIPInfo struct { type reIPParams struct { CatalogPath string `json:"catalog_path"` - ReIPInfoList []reIPInfo `json:"re_ip_list"` + ReIPInfoList []ReIPInfo `json:"re_ip_list"` } func (op *nmaReIPOp) updateRequestBody(_ *opEngineExecContext) error { @@ -135,7 +135,7 @@ func (op *nmaReIPOp) trimReIPList(execContext *opEngineExecContext) error { nodeNamesWithLatestCatalog[vnode.Name] = struct{}{} } - var trimmedReIPList []reIPInfo + var trimmedReIPList []ReIPInfo nodesToTrim := make(map[string]string) for _, reIPInfo := range op.reIPList { if _, exist := nodeNamesWithLatestCatalog[reIPInfo.NodeName]; exist { @@ -271,7 +271,7 @@ func (op *nmaReIPOp) processResult(_ *opEngineExecContext) error { op.logResponse(host, result) if result.isPassing() { - var reIPResult []reIPInfo + var reIPResult []ReIPInfo err := op.parseAndCheckResponse(host, result.content, &reIPResult) if err != nil { err = fmt.Errorf("[%s] fail to parse result on host %s, details: %w", diff --git a/vclusterops/nma_stage_dc_tables_op.go b/vclusterops/nma_stage_dc_tables_op.go index 2439d4b..63cf669 100644 --- a/vclusterops/nma_stage_dc_tables_op.go +++ b/vclusterops/nma_stage_dc_tables_op.go @@ -99,5 +99,5 @@ func (op *nmaStageDCTablesOp) finalize(_ *opEngineExecContext) error { func (op *nmaStageDCTablesOp) processResult(_ *opEngineExecContext) error { fileList := make([]stageDCTablesResponseData, 0) - return processStagedFilesResult(&op.scrutinizeOpBase, fileList) + return processStagedItemsResult(&op.scrutinizeOpBase, fileList) } diff --git a/vclusterops/nma_stage_error_report_op.go b/vclusterops/nma_stage_error_report_op.go index ea67e94..3bc3181 100644 --- a/vclusterops/nma_stage_error_report_op.go +++ b/vclusterops/nma_stage_error_report_op.go @@ -101,5 +101,5 @@ func (op *nmaStageErrorReportOp) finalize(_ 
*opEngineExecContext) error { func (op *nmaStageErrorReportOp) processResult(_ *opEngineExecContext) error { fileList := make([]stageErrorReportResponseData, 0) - return processStagedFilesResult(&op.scrutinizeOpBase, fileList) + return processStagedItemsResult(&op.scrutinizeOpBase, fileList) } diff --git a/vclusterops/nma_stage_system_tables_op.go b/vclusterops/nma_stage_system_tables_op.go new file mode 100644 index 0000000..df55ae3 --- /dev/null +++ b/vclusterops/nma_stage_system_tables_op.go @@ -0,0 +1,130 @@ +/* + (c) Copyright [2023] Open Text. + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package vclusterops + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/vertica/vcluster/vclusterops/vlog" +) + +type nmaStageSystemTablesOp struct { + scrutinizeOpBase + username string + password *string +} + +type stageSystemTablesRequestData struct { + Username string `json:"username"` + Password string `json:"password,omitempty"` +} + +func makeNMAStageSystemTablesOp(logger vlog.Printer, + id, username string, + password *string, + hostNodeNameMap map[string]string) nmaStageSystemTablesOp { + // base members + op := nmaStageSystemTablesOp{} + op.name = "NMAStageSystemTablesOp" + op.logger = logger.WithName(op.name) + + // scrutinize members + op.id = id + op.batch = scrutinizeBatchSystemTables + op.hostNodeNameMap = hostNodeNameMap + op.httpMethod = PostMethod + op.urlSuffix = "/vs" + + // custom members + op.username = username + op.password = password + + return op +} + +func (op *nmaStageSystemTablesOp) setupRequestBody(host string) error { + op.hostRequestBodyMap = make(map[string]string, 1) + data := stageSystemTablesRequestData{} + data.Username = op.username + if op.password != nil { + data.Password = *op.password + } + + dataBytes, err := json.Marshal(data) + if err != nil { + return fmt.Errorf("[%s] fail to marshal request data to JSON string, detail %w", op.name, err) + } + + op.hostRequestBodyMap[host] = string(dataBytes) + + return nil +} + +func (op *nmaStageSystemTablesOp) prepare(execContext *opEngineExecContext) error { + // only run this task on a single up node + // if there are no up nodes, skip this task + if len(execContext.upHosts) == 0 { + op.logger.PrintWarning("no up hosts to collect system tables from, skipping the operation") + op.skipExecute = true + return nil + } + host := getInitiator(execContext.upHosts) + + err := op.setupRequestBody(host) + if err != nil { + return err + } + + // construct host list for interface purposes + op.hosts = []string{host} + err = validateHostMaps(op.hosts, op.hostNodeNameMap) + if err != nil { + return err + } + + execContext.dispatcher.setup(op.hosts) + return op.setupClusterHTTPRequest(op.hosts) +} + +func (op *nmaStageSystemTablesOp) execute(execContext *opEngineExecContext) error { + if err := op.runExecute(execContext); err != nil { + return err + } + + return op.processResult(execContext) +} + +func (op *nmaStageSystemTablesOp) finalize(_ *opEngineExecContext) error { + return nil +} + +func (op *nmaStageSystemTablesOp) 
processResult(_ *opEngineExecContext) error { + var allErrs error + + for host, result := range op.clusterHTTPRequest.ResultCollection { + op.logResponse(host, result) + if result.isPassing() { + // the body should be empty in non-error conditions + op.logger.Info("System table staging initiated", "Host", host) + } else { + allErrs = errors.Join(allErrs, result.err) + } + } + + return allErrs +} diff --git a/vclusterops/nma_stage_vertica_logs_op.go b/vclusterops/nma_stage_vertica_logs_op.go index 5da767c..52c9d6e 100644 --- a/vclusterops/nma_stage_vertica_logs_op.go +++ b/vclusterops/nma_stage_vertica_logs_op.go @@ -113,5 +113,5 @@ func (op *nmaStageVerticaLogsOp) finalize(_ *opEngineExecContext) error { func (op *nmaStageVerticaLogsOp) processResult(_ *opEngineExecContext) error { fileList := make([]stageVerticaLogsResponseData, 0) - return processStagedFilesResult(&op.scrutinizeOpBase, fileList) + return processStagedItemsResult(&op.scrutinizeOpBase, fileList) } diff --git a/vclusterops/re_ip.go b/vclusterops/re_ip.go index 26809de..6e6897b 100644 --- a/vclusterops/re_ip.go +++ b/vclusterops/re_ip.go @@ -29,7 +29,7 @@ type VReIPOptions struct { DatabaseOptions // re-ip list - ReIPList []reIPInfo + ReIPList []ReIPInfo /* hidden option */ @@ -274,7 +274,7 @@ func (opt *VReIPOptions) ReadReIPFile(path string) error { ipv6 := opt.Ipv6.ToBool() for _, row := range reIPRows { - var info reIPInfo + var info ReIPInfo info.NodeAddress = row.CurrentAddress if e := addressCheck(row.CurrentAddress, ipv6); e != nil { return e diff --git a/vclusterops/re_ip_test.go b/vclusterops/re_ip_test.go index fab06bb..cd5ac60 100644 --- a/vclusterops/re_ip_test.go +++ b/vclusterops/re_ip_test.go @@ -39,7 +39,7 @@ func TestReIPOptions(t *testing.T) { err = opt.validateAnalyzeOptions(vlog.Printer{}) assert.ErrorContains(t, err, "the re-ip list is not provided") - var info reIPInfo + var info ReIPInfo info.NodeAddress = "192.168.1.102" info.TargetAddress = "192.168.1.103" opt.ReIPList = append(opt.ReIPList, info) @@ -89,7 +89,7 @@ func TestTrimReIPList(t *testing.T) { // build a stub re-ip list // which has an extra node compared to the actual NmaVDatabase for i := 0; i < 4; i++ { - var info reIPInfo + var info ReIPInfo info.NodeName = fmt.Sprintf("v_%s_node000%d", dbName, i+1) info.NodeAddress = fmt.Sprintf("vnode%d", i+1) info.TargetAddress = fmt.Sprintf("vnode_new_%d", i+1) diff --git a/vclusterops/scrutinize.go b/vclusterops/scrutinize.go index 91a1f60..b5ba973 100644 --- a/vclusterops/scrutinize.go +++ b/vclusterops/scrutinize.go @@ -40,6 +40,7 @@ const scrutinizeLogLimitBytes = 10737418240 // 10GB in bytes // batches are fixed, top level folders for each node's data const scrutinizeBatchNormal = "normal" const scrutinizeBatchContext = "context" +const scrutinizeBatchSystemTables = "system_tables" type VScrutinizeOptions struct { DatabaseOptions @@ -198,16 +199,16 @@ func tarAndRemoveDirectory(id string, log vlog.Printer) (err error) { func (options *VScrutinizeOptions) getVDBForScrutinize(logger vlog.Printer, vdb *VCoordinationDatabase) error { // get nodes where NMA is running and only use those for NMA ops - nmaGetHealthyNodesOp := makeNMAGetHealthyNodesOp(logger, options.Hosts, vdb) - err := options.runClusterOpEngine(logger, []clusterOp{&nmaGetHealthyNodesOp}) + getHealthyNodesOp := makeNMAGetHealthyNodesOp(logger, options.Hosts, vdb) + err := options.runClusterOpEngine(logger, []clusterOp{&getHealthyNodesOp}) if err != nil { return err } // get map of host to node name and fully qualified catalog path - 
nmaGetNodesInfoOp := makeNMAGetNodesInfoOp(logger, vdb.HostList, *options.DBName, + getNodesInfoOp := makeNMAGetNodesInfoOp(logger, vdb.HostList, *options.DBName, *options.CatalogPrefix, true /* ignore internal errors */, vdb) - err = options.runClusterOpEngine(logger, []clusterOp{&nmaGetNodesInfoOp}) + err = options.runClusterOpEngine(logger, []clusterOp{&getNodesInfoOp}) if err != nil { return err } @@ -232,14 +233,14 @@ func (options *VScrutinizeOptions) getVDBForScrutinize(logger vlog.Printer, // The generated instructions will later perform the following operations necessary // for a successful scrutinize: // - Get up nodes through https call -// - TODO Initiate system table staging on the first up node, if available +// - Initiate system table staging on the first up node, if available // - Stage vertica logs on all nodes // - Stage ErrorReport.txt on all nodes -// - TODO Stage DC tables on all nodes -// - Tar and retrieve vertica logs (TODO + DC tables) from all nodes (batch normal) +// - Stage DC tables on all nodes +// - Tar and retrieve vertica logs and DC tables from all nodes (batch normal) // - Tar and retrieve error report from all nodes (batch context) -// - TODO (If applicable) Poll for system table staging completion on task node -// - TODO (If applicable) Tar and retrieve system tables from task node (batch system_tables) +// - (If applicable) Poll for system table staging completion on task node +// - (If applicable) Tar and retrieve system tables from task node (batch system_tables) func (vcc *VClusterCommands) produceScrutinizeInstructions(options *VScrutinizeOptions, vdb *VCoordinationDatabase) (instructions []clusterOp, err error) { // extract needed info from vdb @@ -248,40 +249,45 @@ func (vcc *VClusterCommands) produceScrutinizeInstructions(options *VScrutinizeO return nil, fmt.Errorf("failed to process retrieved node info, details %w", err) } - // Get up database nodes for the system table task later - httpsGetUpNodesOp, err := makeHTTPSGetUpNodesOp(vcc.Log, *options.DBName, options.Hosts, + // Get up database nodes for the system table task + getUpNodesOp, err := makeHTTPSGetUpNodesOp(vcc.Log, *options.DBName, options.Hosts, options.usePassword, *options.UserName, options.Password) if err != nil { return nil, err } - httpsGetUpNodesOp.allowNoUpHosts() - instructions = append(instructions, &httpsGetUpNodesOp) + getUpNodesOp.allowNoUpHosts() + instructions = append(instructions, &getUpNodesOp) + + // Initiate system table staging early as it may take significantly longer than other ops + stageSystemTablesOp := makeNMAStageSystemTablesOp(vcc.Log, options.ID, *options.UserName, + options.Password, hostNodeNameMap) + instructions = append(instructions, &stageSystemTablesOp) // stage Vertica logs - nmaStageVerticaLogsOp, err := makeNMAStageVerticaLogsOp(vcc.Log, options.ID, options.Hosts, + stageVerticaLogsOp, err := makeNMAStageVerticaLogsOp(vcc.Log, options.ID, options.Hosts, hostNodeNameMap, hostCatPathMap, scrutinizeLogLimitBytes, scrutinizeLogAgeHours) if err != nil { // map invariant assertion failure -- should not occur return nil, err } - instructions = append(instructions, &nmaStageVerticaLogsOp) + instructions = append(instructions, &stageVerticaLogsOp) // stage DC Tables - nmaStageDCTablesOp, err := makeNMAStageDCTablesOp(vcc.Log, options.ID, options.Hosts, + stageDCTablesOp, err := makeNMAStageDCTablesOp(vcc.Log, options.ID, options.Hosts, hostNodeNameMap, hostCatPathMap) if err != nil { // map invariant assertion failure -- should not occur return nil, 
err
 	}
-	instructions = append(instructions, &nmaStageDCTablesOp)
+	instructions = append(instructions, &stageDCTablesOp)
 
 	// stage ErrorReport.txt
-	nmaStageVerticaErrorReportOp, err := makeNMAStageErrorReportOp(vcc.Log, options.ID, options.Hosts,
+	stageVerticaErrorReportOp, err := makeNMAStageErrorReportOp(vcc.Log, options.ID, options.Hosts,
 		hostNodeNameMap, hostCatPathMap)
 	if err != nil {
 		return nil, err
 	}
-	instructions = append(instructions, &nmaStageVerticaErrorReportOp)
+	instructions = append(instructions, &stageVerticaErrorReportOp)
 
 	// get 'normal' batch tarball (inc. Vertica logs)
 	getNormalTarballOp, err := makeNMAGetScrutinizeTarOp(vcc.Log, options.ID, scrutinizeBatchNormal,
@@ -299,6 +305,19 @@ func (vcc *VClusterCommands) produceScrutinizeInstructions(options *VScrutinizeO
 	}
 	instructions = append(instructions, &getContextTarballOp)
 
+	// check for system table staging completion before continuing
+	checkSystemTablesOp := makeNMACheckSystemTablesOp(vcc.Log, options.ID, hostNodeNameMap)
+	instructions = append(instructions, &checkSystemTablesOp)
+
+	// get 'system_tables' batch tarball last, as staging system tables can take a long time
+	getSystemTablesTarballOp, err := makeNMAGetScrutinizeTarOp(vcc.Log, options.ID, scrutinizeBatchSystemTables,
+		options.Hosts, hostNodeNameMap)
+	if err != nil {
+		return nil, err
+	}
+	getSystemTablesTarballOp.useSingleHost()
+	instructions = append(instructions, &getSystemTablesTarballOp)
+
 	return instructions, nil
 }
 
diff --git a/vclusterops/scrutinize_op.go b/vclusterops/scrutinize_op.go
index e4f8e4f..f41dbc1 100644
--- a/vclusterops/scrutinize_op.go
+++ b/vclusterops/scrutinize_op.go
@@ -52,9 +52,10 @@ func (op *scrutinizeOpBase) setupClusterHTTPRequest(hosts []string) error {
 	return nil
 }
 
-// processeStagedFilesResult is a parameterized function which contains common logic
-// for processing the results of staging various types of files, e.g. vertica.log
-func processStagedFilesResult[T any](op *scrutinizeOpBase, fileList []T) error {
+// processStagedItemsResult is a parameterized function which contains common logic
+// for processing the results of staging various types of items, e.g. vertica.log,
+// system tables, etc.
+func processStagedItemsResult[T any](op *scrutinizeOpBase, itemList []T) error {
 	var allErrs error
 
 	for host, result := range op.clusterHTTPRequest.ResultCollection {
@@ -65,15 +66,15 @@ func processStagedFilesResult[T any](op *scrutinizeOpBase, fileList []T) error {
 			op.logger.Info("nothing staged on host", "Host", host)
 			continue
 		}
-		// the response is an array of file info structs
-		err := op.parseAndCheckResponse(host, result.content, &fileList)
+		// the response is an array of item info structs
+		err := op.parseAndCheckResponse(host, result.content, &itemList)
 		if err != nil {
 			err = fmt.Errorf("[%s] fail to parse result on host %s, details: %w",
 				op.name, host, err)
 			allErrs = errors.Join(allErrs, err)
 			continue
 		}
-		for _, fileEntry := range fileList {
-			op.logger.Info("file staged on host", "Host", host, "FileInfo", fileEntry)
+		for _, entry := range itemList {
+			op.logger.Info("item staged on host", "Host", host, "Item", entry)
 		}
 	} else {
 		allErrs = errors.Join(allErrs, result.err)
diff --git a/vclusterops/state_poller.go b/vclusterops/state_poller.go
index 4dfe37a..9e599a1 100644
--- a/vclusterops/state_poller.go
+++ b/vclusterops/state_poller.go
@@ -21,11 +21,12 @@ import (
 )
 
 const (
-	OneSecond             = 1
-	OneMinute             = 60 * OneSecond
-	StopDBTimeout         = 5 * OneMinute
-	StartupPollingTimeout = 5 * OneMinute
-	PollingInterval       = 3 * OneSecond
+	OneSecond                = 1
+	OneMinute                = 60 * OneSecond
+	StopDBTimeout            = 5 * OneMinute
+	StartupPollingTimeout    = 5 * OneMinute
+	ScrutinizePollingTimeout = -1 * OneMinute // no timeout
+	PollingInterval          = 3 * OneSecond
 )
 
 type statePoller interface {
@@ -34,15 +35,20 @@ type statePoller interface {
 	runExecute(execContext *opEngineExecContext) error
 }
 
-// pollState is a helper function to poll state for all ops that implement the statePoller interface
+// pollState is a helper function to poll state for all ops that implement the statePoller interface.
+// If poller.getPollingTimeout() returns a value < 0, pollState will poll forever.
 func pollState(poller statePoller, execContext *opEngineExecContext) error {
 	startTime := time.Now()
 	timeout := poller.getPollingTimeout()
 	duration := time.Duration(timeout) * time.Second
 	count := 0
+	needTimeout := true
+	if timeout < 0 {
+		needTimeout = false
+	}
 
 	for endTime := startTime.Add(duration); ; {
-		if time.Now().After(endTime) {
+		if needTimeout && time.Now().After(endTime) {
 			break
 		}
diff --git a/vclusterops/util/util.go b/vclusterops/util/util.go
index d1879c3..183620e 100644
--- a/vclusterops/util/util.go
+++ b/vclusterops/util/util.go
@@ -30,6 +30,7 @@ import (
 	"regexp"
 	"strings"
 
+	"golang.org/x/exp/constraints"
 	"golang.org/x/exp/slices"
 	"golang.org/x/sys/unix"
 
@@ -529,3 +530,12 @@ func ValidateCommunalStorageLocation(location string) error {
 
 	return nil
 }
+
+// Max works on all sane types, not just float64 like the math package funcs.
+// Can be removed after upgrade to go 1.21 (VER-90410) as min/max become builtins.
+func Max[T constraints.Ordered](a, b T) T {
+	if a > b {
+		return a
+	}
+	return b
+}
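For reviewers who want to trace the new timeout semantics end to end, here is a minimal standalone sketch (not part of the patch; pollUntil and check are hypothetical stand-ins for pollState and an op's polling logic) showing how a negative timeout such as ScrutinizePollingTimeout disables the deadline entirely, while ops like httpsPollNodeStateOp clamp their timeout with util.Max so they can never poll forever:

```go
package main

import (
	"fmt"
	"time"

	"golang.org/x/exp/constraints"
)

// Max mirrors the util.Max helper added in this diff: a generic max over
// ordered types, removable once Go 1.21's builtin max is available.
func Max[T constraints.Ordered](a, b T) T {
	if a > b {
		return a
	}
	return b
}

// pollUntil is a simplified stand-in for pollState: it calls check() every
// interval and gives up after timeoutSeconds, unless timeoutSeconds is
// negative, in which case it polls with no deadline (the
// ScrutinizePollingTimeout case).
func pollUntil(check func() bool, timeoutSeconds int, interval time.Duration) error {
	needTimeout := timeoutSeconds >= 0
	endTime := time.Now().Add(time.Duration(timeoutSeconds) * time.Second)
	for {
		if needTimeout && time.Now().After(endTime) {
			return fmt.Errorf("timed out after %d seconds", timeoutSeconds)
		}
		if check() {
			return nil
		}
		time.Sleep(interval)
	}
}

func main() {
	// An op whose timeout field was somehow set negative must not poll
	// forever, so it clamps to zero, as the updated getPollingTimeout does.
	opTimeout := -30
	fmt.Println("clamped timeout:", Max(opTimeout, 0)) // prints 0

	// A check that succeeds on the third attempt, like a host that returns
	// 202 twice before a final 200.
	attempts := 0
	done := func() bool {
		attempts++
		return attempts >= 3
	}
	if err := pollUntil(done, 10, 10*time.Millisecond); err != nil {
		fmt.Println("poll failed:", err)
		return
	}
	fmt.Println("poll succeeded after", attempts, "attempts")
}
```

Running this prints "clamped timeout: 0" followed by "poll succeeded after 3 attempts". The clamp matters because the no-timeout convention is reserved for ops like nmaCheckSystemTablesOp, where system table staging has no predictable upper bound; ops that poll node or subscription state must still give up eventually, so a negative value reaching them is treated as zero rather than as unlimited polling.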