diff --git a/CLAUDE.md b/CLAUDE.md index 4f6db858..78e7f132 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -316,6 +316,33 @@ performance reasons. 4. Update client API in `src/client/apis/` 5. Add CLI command handler if needed in `src/client/commands/` +### Adding a New CLI Subcommand + +For a new subcommand (e.g., `torc workflows correct-resources`): + +1. **Implement handler function** in `src/client/commands/{module}.rs` + - Follow existing pattern from other commands in the same file + - Use `#[command(...)]` attributes for clap configuration + +2. **Add to enum variant** in the `#[derive(Subcommand)]` enum + - Add field struct with `#[arg(...)]` attributes for options/flags + - Use `#[command(name = "...")]` to set the subcommand name + +3. **Update help template** (if applicable) + - For `workflows` commands: Update `WORKFLOWS_HELP_TEMPLATE` constant at top of file + - Add entry to the appropriate category with description (format: `command_name Description`) + - Use ANSI color codes for consistency: `\x1b[1;36m` for command, `\x1b[1;32m` for category + +4. **Remove `hide = true`** if command should be visible + - Default behavior shows command in help unless explicitly hidden + +5. **Add well-formatted help text** in `#[command(...)]` attribute + - Use `after_long_help = "..."` for detailed examples + - Examples are shown when user runs `torc workflows command-name --help` + +6. **Wire up in match statement** + - Add case in the match block that calls your handler function (usually around line 3400+) + ### Creating a Workflow from Specification 1. Write workflow spec file (JSON/JSON5/YAML) following `WorkflowSpec` format @@ -448,9 +475,11 @@ unified CLI. 
- `torc submit-slurm --account ` - Submit with auto-generated Slurm schedulers - `torc tui` - Interactive terminal UI -**Utilities**: +**Reports & Analysis**: -- `torc reports ` - Generate reports +- `torc reports check-resource-utilization ` - Check which jobs exceeded resource limits +- `torc reports results ` - Get detailed job execution results +- `torc reports summary ` - Get workflow completion summary **Global Options** (available on all commands): diff --git a/api/openapi.yaml b/api/openapi.yaml index 8a7187b2..326262f1 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -5992,6 +5992,12 @@ components: timestamp: description: Timestamp of workflow creation type: string + project: + description: Project name or identifier for grouping workflows + type: string + metadata: + description: Arbitrary metadata as JSON string + type: string compute_node_expiration_buffer_seconds: default: 180 diff --git a/docs/src/core/how-to/check-resource-utilization.md b/docs/src/core/how-to/check-resource-utilization.md index ba8189ae..4a5a06f6 100644 --- a/docs/src/core/how-to/check-resource-utilization.md +++ b/docs/src/core/how-to/check-resource-utilization.md @@ -36,22 +36,80 @@ For workflows that have been reinitialized multiple times: torc reports check-resource-utilization --run-id 2 ``` -## Adjusting Requirements +## Automatically Correct Requirements -When jobs exceed their limits, update your workflow specification with a buffer: +Use the separate `correct-resources` command to automatically adjust resource allocations based on +actual resource measurements: + +```bash +torc workflows correct-resources +``` + +This analyzes both completed and failed jobs to detect: + +- **Memory violations** — Jobs using more memory than allocated +- **CPU violations** — Jobs using more CPU than allocated +- **Runtime violations** — Jobs running longer than allocated time + +The command will: + +- Calculate new requirements using actual peak usage data +- Apply a 1.2x safety 
multiplier to each resource +- Update the workflow's resource requirements for future runs + +Example: + +``` +Analyzing and correcting resource requirements for workflow 5 +✓ Resource requirements updated successfully + +Corrections applied: + memory_training: 8g → 10g (+25.0%) + cpu_training: 4 → 5 cores (+25.0%) + runtime_training: PT2H → PT2H30M (+25.0%) +``` + +### Preview Changes Without Applying + +Use `--dry-run` to see what changes would be made: + +```bash +torc workflows correct-resources --dry-run +``` + +### Correct Only Specific Jobs + +To update only certain jobs (by ID): + +```bash +torc workflows correct-resources --job-ids 15,16,18 +``` + +### Custom Correction Multiplier + +Adjust the safety margin (default 1.2x): + +```bash +torc workflows correct-resources --memory-multiplier 1.5 --runtime-multiplier 1.4 +``` + +## Manual Adjustment + +For more control, update your workflow specification with a buffer: ```yaml resource_requirements: - name: training memory: 12g # 10.5 GB peak + 15% buffer runtime: PT3H # 2h 45m actual + buffer + num_cpus: 7 # Enough for peak CPU usage ``` **Guidelines:** - Memory: Add 10-20% above peak usage - Runtime: Add 15-30% above actual duration -- CPU: Round up to next core count +- CPU: Round up to accommodate peak percentage (e.g., 501% CPU → 6 cores) ## See Also diff --git a/docs/src/core/reference/cli-cheatsheet.md b/docs/src/core/reference/cli-cheatsheet.md index 1c195385..49b4f7c8 100644 --- a/docs/src/core/reference/cli-cheatsheet.md +++ b/docs/src/core/reference/cli-cheatsheet.md @@ -43,6 +43,7 @@ | `torc watch --recover --auto-schedule` | Full production recovery mode | | `torc workflows sync-status ` | Fix orphaned jobs (stuck in "running") | | `torc reports check-resource-utilization ` | Check memory/CPU/time usage | +| `torc workflows correct-resources ` | Auto-correct resource requirements | | `torc reports summary ` | Workflow completion summary | | `torc reports results ` | JSON report of job results with 
log paths | | `torc slurm sacct ` | Get Slurm accounting data | diff --git a/docs/src/core/reference/cli.md b/docs/src/core/reference/cli.md index 3c95af66..3a1d4ea3 100644 --- a/docs/src/core/reference/cli.md +++ b/docs/src/core/reference/cli.md @@ -1,15 +1,5 @@ # CLI Reference -This documentation is automatically generated from the CLI help text. - -To regenerate, run: - -```bash -cargo run --bin generate-cli-docs --features "client,tui,plot_resources" -``` - -# Command-Line Help for `torc` - This document contains the help content for the `torc` command-line program. **Command Overview:** @@ -1967,6 +1957,8 @@ Check resource utilization and report jobs that exceeded their specified require - `-r`, `--run-id ` — Run ID to analyze (optional - analyzes latest run if not provided) - `-a`, `--all` — Show all jobs (default: only show jobs that exceeded requirements) - `--include-failed` — Include failed and terminated jobs in the analysis (for recovery diagnostics) +- `--min-over-utilization ` — Minimum over-utilization percentage to flag as + violation (default: 1.0%) ## `torc reports results` diff --git a/docs/src/core/reference/workflow-spec.md b/docs/src/core/reference/workflow-spec.md index 6aab9c42..f932f932 100644 --- a/docs/src/core/reference/workflow-spec.md +++ b/docs/src/core/reference/workflow-spec.md @@ -12,6 +12,8 @@ The top-level container for a complete workflow definition. 
| `name` | string | _required_ | Name of the workflow | | `user` | string | current user | User who owns this workflow | | `description` | string | none | Description of the workflow | +| `project` | string | none | Project name or identifier for grouping workflows | +| `metadata` | string | none | Arbitrary metadata as JSON string | | `parameters` | map\ | none | Shared parameters that can be used by jobs and files via `use_parameters` | | `jobs` | [[JobSpec](#jobspec)] | _required_ | Jobs that make up this workflow | | `files` | [[FileSpec](#filespec)] | none | Files associated with this workflow | @@ -29,6 +31,44 @@ The top-level container for a complete workflow definition. | `compute_node_wait_for_healthy_database_minutes` | integer | none | Compute nodes wait this many minutes for database recovery | | `jobs_sort_method` | [ClaimJobsSortMethod](#claimjobssortmethod) | `none` | Method for sorting jobs when claiming them | +### Examples with project and metadata + +The `project` and `metadata` fields are useful for organizing and categorizing workflows. For more +detailed guidance on organizing workflows, see +[Organizing and Managing Workflows](../workflows/organizing-workflows.md). + +**YAML example:** + +```yaml +name: "ml_training_workflow" +project: "customer-churn-prediction" +metadata: '{"environment":"staging","version":"1.0.0","team":"ml-engineering"}' +description: "Train and evaluate churn prediction model" +jobs: + - name: "preprocess" + command: "python preprocess.py" + - name: "train" + command: "python train.py" + depends_on: ["preprocess"] +``` + +**JSON example:** + +```json +{ + "name": "data_pipeline", + "project": "analytics-platform", + "metadata": "{\"cost_center\":\"eng-data\",\"priority\":\"high\"}", + "description": "Daily data processing pipeline", + "jobs": [ + { + "name": "extract", + "command": "python extract.py" + } + ] +} +``` + ## JobSpec Defines a single computational task within a workflow. 
diff --git a/docs/src/core/workflows/index.md b/docs/src/core/workflows/index.md index bf8b43ae..b6bc784d 100644 --- a/docs/src/core/workflows/index.md +++ b/docs/src/core/workflows/index.md @@ -2,8 +2,15 @@ This section covers how to create, configure, and manage workflows. +## Creating and Formatting + - [Creating Workflows](./creating-workflows.md) - Getting started with workflow creation - [Workflow Specification Formats](./workflow-formats.md) - JSON, YAML, and other formats + +## Organization and Management + +- [Organizing and Managing Workflows](./organizing-workflows.md) - Using project and metadata fields + to categorize and track workflows - [Visualizing Workflow Structure](./visualizing-workflows.md) - Viewing workflow graphs - [Exporting and Importing Workflows](./export-import-workflows.md) - Moving workflows between systems diff --git a/docs/src/core/workflows/organizing-workflows.md b/docs/src/core/workflows/organizing-workflows.md new file mode 100644 index 00000000..07407d67 --- /dev/null +++ b/docs/src/core/workflows/organizing-workflows.md @@ -0,0 +1,330 @@ +# Organizing and Managing Workflows + +Workflows can be organized and managed using the `project` and `metadata` fields, which help you +categorize, track, and manage workflows across your organization. + +## Overview + +- **`project`**: A string field for grouping related workflows (e.g., by department, client, or + initiative) +- **`metadata`**: A JSON string field for storing arbitrary custom information about the workflow + +Both fields are optional and can be set when creating a workflow or updated at any time. + +## Using the Project Field + +The `project` field is useful for grouping related workflows together. 
Common use cases: + +### By Department or Team + +```yaml +name: "data_validation" +project: "data-engineering" +jobs: + - name: "validate" + command: "python validate.py" +``` + +### By Client or Customer + +```yaml +name: "client_report_generation" +project: "acme-corp" +description: "Monthly analytics reports for ACME Corp" +jobs: + - name: "extract" + command: "python extract_data.py" + - name: "report" + command: "python generate_report.py" + depends_on: ["extract"] +``` + +### By Application or Service + +```yaml +name: "model_retraining" +project: "recommendation-engine" +jobs: + - name: "collect_data" + command: "python collect_training_data.py" + - name: "train" + command: "python train_model.py" + depends_on: ["collect_data"] +``` + +## Using the Metadata Field + +The `metadata` field stores arbitrary JSON data, making it perfect for tracking additional context +about your workflow. + +### Environment and Version Tracking + +```yaml +name: "ml_pipeline" +project: "ai-models" +metadata: '{"environment":"production","model_version":"2.1.0","framework":"pytorch"}' +jobs: + - name: "preprocess" + command: "python preprocess.py" +``` + +### Cost and Billing Tracking + +```yaml +name: "data_processing" +project: "analytics-platform" +metadata: '{"cost_center":"eng-data","billing_code":"proj-2024-001","budget":"$5000"}' +jobs: + - name: "process" + command: "python process_data.py" +``` + +### Ownership and Scheduling Information + +```yaml +name: "hourly_sync" +project: "data-platform" +metadata: | + { + "owner": "data-platform-team", + "schedule": "0 * * * *", + "critical": true, + "on_call": "oncall@company.com", + "sla": "15 minutes" + } +jobs: + - name: "sync" + command: "python sync_data.py" +``` + +### Testing and Deployment Metadata + +```yaml +name: "integration_tests" +project: "backend-api" +metadata: | + { + "test_type": "integration", + "test_suite": "api_v2", + "required_before_deploy": true, + "notifications": ["slack:devops", 
"email:qa@company.com"], + "timeout_minutes": 30 + } +jobs: + - name: "run_tests" + command: "pytest tests/integration/" +``` + +## Querying and Filtering Workflows + +### Listing Workflows with Project and Metadata + +```bash +# View all workflows with their project and metadata +torc workflows list + +# View in JSON format for programmatic access +torc workflows list -f json + +# Get details of a specific workflow +torc workflows get -f json +``` + +### JSON Output Example + +```json +{ + "id": 42, + "name": "ml_training_pipeline", + "project": "customer-ai", + "metadata": { + "environment": "staging", + "version": "1.2.0", + "owner": "ml-team" + }, + "description": "Train and evaluate models", + "user": "alice", + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +## Updating Workflow Organization + +You can update the `project` and `metadata` fields at any time: + +### Update Project + +```bash +# Change workflow project +torc workflows update --project "new-department" +``` + +### Update Metadata + +```bash +# Update metadata with new information +torc workflows update --metadata '{"stage":"production","last_review":"2024-01-15"}' + +# Or add additional metadata +torc workflows update --metadata '{"owner":"new-team","priority":"high"}' +``` + +### Update Both + +```bash +torc workflows update --project "sales" --metadata '{"client":"xyz-corp","contract":"2024-001"}' +``` + +## Workflow Organization Best Practices + +### 1. Consistent Project Naming + +Use consistent naming conventions for projects to make filtering easier: + +```yaml +# Good: lowercase, dash-separated +project: "customer-data-pipeline" +project: "ml-model-training" +project: "backend-api-tests" + +# Avoid: inconsistent naming +project: "Customer Data Pipeline" +project: "ML_Model_Training" +project: "bApiT" +``` + +### 2. 
Structured Metadata + +Store metadata as valid JSON objects for better programmatic access: + +```yaml +# Good: valid JSON object +metadata: '{"owner":"team","env":"prod","version":"1.0"}' + +# Avoid: unstructured strings +metadata: 'owner=team, env=prod, version 1.0' +``` + +### 3. Document Metadata Schema + +Keep documentation of what metadata fields your organization uses: + +```yaml +metadata: | + { + "team": "string - owning team", + "environment": "string - deployment environment", + "version": "semver - software version", + "critical": "boolean - mission-critical", + "review_date": "ISO8601 - last review date" + } +``` + +### 4. Combine Project and Metadata + +Use project for high-level categorization and metadata for detailed information: + +```yaml +name: "customer_analytics" +project: "analytics-platform" # High-level grouping +metadata: | # Detailed information + { + "customer_id": "ACME-2024", + "region": "us-west", + "sla": "99.9%", + "cost_center": "analytics" + } +``` + +## Examples + +### Data Engineering Workflow + +```yaml +name: "daily_etl_pipeline" +project: "data-warehouse" +metadata: | + { + "schedule": "daily", + "run_time": "02:00 UTC", + "owner": "data-engineering-team", + "estimated_duration_minutes": 45, + "source_systems": ["salesforce", "zendesk"], + "target_table": "customer_analytics", + "sla_hours": 4 + } +description: "Daily ETL process for customer analytics data" +jobs: + - name: "extract_salesforce" + command: "python extract_salesforce.py" + - name: "extract_zendesk" + command: "python extract_zendesk.py" + - name: "transform_and_load" + command: "python transform_load.py" + depends_on: ["extract_salesforce", "extract_zendesk"] +``` + +### Machine Learning Workflow + +```yaml +name: "model_training_pipeline" +project: "recommendation-engine" +metadata: | + { + "model_type": "collaborative_filtering", + "framework": "pytorch", + "training_date": "2024-01-15", + "dataset_version": "v3.2.0", + "hyperparameters": { + 
"learning_rate": 0.001, + "batch_size": 64, + "epochs": 100 + }, + "metrics": { + "accuracy": 0.94, + "precision": 0.91, + "recall": 0.93 + }, + "next_review": "2024-04-15" + } +description: "Train recommendation models" +jobs: + - name: "prepare_data" + command: "python prepare_training_data.py" + - name: "train_model" + command: "python train.py" + depends_on: ["prepare_data"] + - name: "evaluate" + command: "python evaluate.py" + depends_on: ["train_model"] +``` + +### Testing and CI/CD Workflow + +```yaml +name: "backend_integration_tests" +project: "backend-api" +metadata: | + { + "ci_cd_integration": true, + "test_framework": "pytest", + "coverage_requirement": 85, + "notifications": { + "on_failure": ["slack:devops"], + "on_success": ["slack:dev-team"] + }, + "blocking_deployment": true, + "timeout_minutes": 30, + "retry_count": 2 + } +description: "Run integration tests for backend API" +jobs: + - name: "setup_test_db" + command: "python setup_test_db.py" + - name: "run_tests" + command: "pytest tests/integration/ -v" + depends_on: ["setup_test_db"] + - name: "generate_coverage_report" + command: "coverage report --html" + depends_on: ["run_tests"] +``` diff --git a/docs/src/getting-started/installation.md b/docs/src/getting-started/installation.md index 2886beea..514e44a6 100644 --- a/docs/src/getting-started/installation.md +++ b/docs/src/getting-started/installation.md @@ -73,14 +73,17 @@ Or add to your `~/.bashrc` for persistence: echo 'export PATH="/scratch/dthom/torc/latest:$PATH"' >> ~/.bashrc ``` -**Shared server**: A `torc-server` instance runs on a dedicated VM within the Kestrel environment. -Contact Daniel Thom for access credentials and the server URL. Once you have access: +**Shared server**: A `torc-server` instance runs on a dedicated VM within the Kestrel environment at +`torc.hpc.nrel.gov`. Contact Daniel Thom if you would like to create an access group to store +private workflows for your team. 
```bash -export TORC_API_URL="http:///torc-service/v1" -export TORC_PASSWORD="" +export TORC_API_URL="http://torc.hpc.nrel.gov:8080/torc-service/v1" ``` +**Note:** Be mindful of how much data you store in this database. Delete workflows that you don't +need any longer. Refer to `torc workflows export --help` for making backups of your workflows. + ## Building from Source ### Prerequisites diff --git a/julia_client/Torc/src/api/models/model_WorkflowModel.jl b/julia_client/Torc/src/api/models/model_WorkflowModel.jl index c2cac6ee..1ad9f663 100644 --- a/julia_client/Torc/src/api/models/model_WorkflowModel.jl +++ b/julia_client/Torc/src/api/models/model_WorkflowModel.jl @@ -10,6 +10,8 @@ user=nothing, description=nothing, timestamp=nothing, + project=nothing, + metadata=nothing, compute_node_expiration_buffer_seconds=180, compute_node_wait_for_new_jobs_seconds=90, compute_node_ignore_workflow_completion=false, @@ -27,6 +29,8 @@ - user::String : User that created the workflow - description::String : Description of the workflow - timestamp::String : Timestamp of workflow creation + - project::String : Project name or identifier for grouping workflows + - metadata::String : Arbitrary metadata as JSON string - compute_node_expiration_buffer_seconds::Int64 : Inform all compute nodes to shut down this number of seconds before the expiration time. This allows torc to send SIGTERM to all job processes and set all statuses to terminated. Increase the time in cases where the job processes handle SIGTERM and need more time to gracefully shut down. Set the value to 0 to maximize the time given to jobs. If not set, take the database's default value of 90 seconds. - compute_node_wait_for_new_jobs_seconds::Int64 : Inform all compute nodes to wait for new jobs for this time period before exiting. Does not apply if the workflow is complete. Default must be >= completion_check_interval_secs + job_completion_poll_interval to avoid exiting before dependent jobs are unblocked. 
- compute_node_ignore_workflow_completion::Bool : Inform all compute nodes to ignore workflow completions and hold onto allocations indefinitely. Useful for debugging failed jobs and possibly dynamic workflows where jobs get added after starting. @@ -44,6 +48,8 @@ Base.@kwdef mutable struct WorkflowModel <: OpenAPI.APIModel user::Union{Nothing, String} = nothing description::Union{Nothing, String} = nothing timestamp::Union{Nothing, String} = nothing + project::Union{Nothing, String} = nothing + metadata::Union{Nothing, String} = nothing compute_node_expiration_buffer_seconds::Union{Nothing, Int64} = 180 compute_node_wait_for_new_jobs_seconds::Union{Nothing, Int64} = 90 compute_node_ignore_workflow_completion::Union{Nothing, Bool} = false @@ -55,14 +61,14 @@ Base.@kwdef mutable struct WorkflowModel <: OpenAPI.APIModel use_pending_failed::Union{Nothing, Bool} = false status_id::Union{Nothing, Int64} = nothing - function WorkflowModel(id, name, user, description, timestamp, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, status_id, ) - o = new(id, name, user, description, timestamp, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, status_id, ) + function WorkflowModel(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, 
slurm_defaults, use_pending_failed, status_id, ) + o = new(id, name, user, description, timestamp, project, metadata, compute_node_expiration_buffer_seconds, compute_node_wait_for_new_jobs_seconds, compute_node_ignore_workflow_completion, compute_node_wait_for_healthy_database_minutes, compute_node_min_time_for_new_jobs_seconds, jobs_sort_method, resource_monitor_config, slurm_defaults, use_pending_failed, status_id, ) OpenAPI.validate_properties(o) return o end end # type WorkflowModel -const _property_types_WorkflowModel = Dict{Symbol,String}(Symbol("id")=>"Int64", Symbol("name")=>"String", Symbol("user")=>"String", Symbol("description")=>"String", Symbol("timestamp")=>"String", Symbol("compute_node_expiration_buffer_seconds")=>"Int64", Symbol("compute_node_wait_for_new_jobs_seconds")=>"Int64", Symbol("compute_node_ignore_workflow_completion")=>"Bool", Symbol("compute_node_wait_for_healthy_database_minutes")=>"Int64", Symbol("compute_node_min_time_for_new_jobs_seconds")=>"Int64", Symbol("jobs_sort_method")=>"JobsSortMethod", Symbol("resource_monitor_config")=>"String", Symbol("slurm_defaults")=>"String", Symbol("use_pending_failed")=>"Bool", Symbol("status_id")=>"Int64", ) +const _property_types_WorkflowModel = Dict{Symbol,String}(Symbol("id")=>"Int64", Symbol("name")=>"String", Symbol("user")=>"String", Symbol("description")=>"String", Symbol("timestamp")=>"String", Symbol("project")=>"String", Symbol("metadata")=>"String", Symbol("compute_node_expiration_buffer_seconds")=>"Int64", Symbol("compute_node_wait_for_new_jobs_seconds")=>"Int64", Symbol("compute_node_ignore_workflow_completion")=>"Bool", Symbol("compute_node_wait_for_healthy_database_minutes")=>"Int64", Symbol("compute_node_min_time_for_new_jobs_seconds")=>"Int64", Symbol("jobs_sort_method")=>"JobsSortMethod", Symbol("resource_monitor_config")=>"String", Symbol("slurm_defaults")=>"String", Symbol("use_pending_failed")=>"Bool", Symbol("status_id")=>"Int64", ) OpenAPI.property_type(::Type{ WorkflowModel 
}, name::Symbol) = Union{Nothing,eval(Base.Meta.parse(_property_types_WorkflowModel[name]))} function OpenAPI.check_required(o::WorkflowModel) @@ -77,6 +83,8 @@ function OpenAPI.validate_properties(o::WorkflowModel) OpenAPI.validate_property(WorkflowModel, Symbol("user"), o.user) OpenAPI.validate_property(WorkflowModel, Symbol("description"), o.description) OpenAPI.validate_property(WorkflowModel, Symbol("timestamp"), o.timestamp) + OpenAPI.validate_property(WorkflowModel, Symbol("project"), o.project) + OpenAPI.validate_property(WorkflowModel, Symbol("metadata"), o.metadata) OpenAPI.validate_property(WorkflowModel, Symbol("compute_node_expiration_buffer_seconds"), o.compute_node_expiration_buffer_seconds) OpenAPI.validate_property(WorkflowModel, Symbol("compute_node_wait_for_new_jobs_seconds"), o.compute_node_wait_for_new_jobs_seconds) OpenAPI.validate_property(WorkflowModel, Symbol("compute_node_ignore_workflow_completion"), o.compute_node_ignore_workflow_completion) @@ -105,4 +113,6 @@ function OpenAPI.validate_property(::Type{ WorkflowModel }, name::Symbol, val) + + end diff --git a/julia_client/julia_client/docs/WorkflowModel.md b/julia_client/julia_client/docs/WorkflowModel.md index 0cf81beb..3507c5f5 100644 --- a/julia_client/julia_client/docs/WorkflowModel.md +++ b/julia_client/julia_client/docs/WorkflowModel.md @@ -9,7 +9,9 @@ Name | Type | Description | Notes **user** | **String** | User that created the workflow | [default to nothing] **description** | **String** | Description of the workflow | [optional] [default to nothing] **timestamp** | **String** | Timestamp of workflow creation | [optional] [default to nothing] -**compute_node_expiration_buffer_seconds** | **Int64** | Inform all compute nodes to shut down this number of seconds before the expiration time. This allows torc to send SIGTERM to all job processes and set all statuses to terminated. 
Increase the time in cases where the job processes handle SIGTERM and need more time to gracefully shut down. Set the value to 0 to maximize the time given to jobs. If not set, take the database's default value of 90 seconds. | [optional] [default to 60] +**project** | **String** | Project name or identifier for grouping workflows | [optional] [default to nothing] +**metadata** | **String** | Arbitrary metadata as JSON string | [optional] [default to nothing] +**compute_node_expiration_buffer_seconds** | **Int64** | Inform all compute nodes to shut down this number of seconds before the expiration time. This allows torc to send SIGTERM to all job processes and set all statuses to terminated. Increase the time in cases where the job processes handle SIGTERM and need more time to gracefully shut down. Set the value to 0 to maximize the time given to jobs. If not set, take the database's default value of 90 seconds. | [optional] [default to 180] **compute_node_wait_for_new_jobs_seconds** | **Int64** | Inform all compute nodes to wait for new jobs for this time period before exiting. Does not apply if the workflow is complete. Default must be >= completion_check_interval_secs + job_completion_poll_interval to avoid exiting before dependent jobs are unblocked. | [optional] [default to 90] **compute_node_ignore_workflow_completion** | **Bool** | Inform all compute nodes to ignore workflow completions and hold onto allocations indefinitely. Useful for debugging failed jobs and possibly dynamic workflows where jobs get added after starting. | [optional] [default to false] **compute_node_wait_for_healthy_database_minutes** | **Int64** | Inform all compute nodes to wait this number of minutes if the database becomes unresponsive. 
| [optional] [default to 20] diff --git a/migrations/20260206000000_add_metadata_and_project.down.sql b/migrations/20260206000000_add_metadata_and_project.down.sql new file mode 100644 index 00000000..0d7282bc --- /dev/null +++ b/migrations/20260206000000_add_metadata_and_project.down.sql @@ -0,0 +1,3 @@ +-- Remove project and metadata columns from workflow table +ALTER TABLE workflow DROP COLUMN project; +ALTER TABLE workflow DROP COLUMN metadata; diff --git a/migrations/20260206000000_add_metadata_and_project.up.sql b/migrations/20260206000000_add_metadata_and_project.up.sql new file mode 100644 index 00000000..6e4a5d56 --- /dev/null +++ b/migrations/20260206000000_add_metadata_and_project.up.sql @@ -0,0 +1,5 @@ +-- Add project column to workflow table for grouping/categorization +ALTER TABLE workflow ADD COLUMN project TEXT NULL; + +-- Add metadata column to workflow table for arbitrary JSON metadata +ALTER TABLE workflow ADD COLUMN metadata TEXT NULL; diff --git a/python_client/src/torc/openapi_client/models/workflow_model.py b/python_client/src/torc/openapi_client/models/workflow_model.py index 49b66ed6..ac2ffa45 100644 --- a/python_client/src/torc/openapi_client/models/workflow_model.py +++ b/python_client/src/torc/openapi_client/models/workflow_model.py @@ -32,7 +32,9 @@ class WorkflowModel(BaseModel): user: StrictStr = Field(description="User that created the workflow") description: Optional[StrictStr] = Field(default=None, description="Description of the workflow") timestamp: Optional[StrictStr] = Field(default=None, description="Timestamp of workflow creation") - compute_node_expiration_buffer_seconds: Optional[StrictInt] = Field(default=60, description="Inform all compute nodes to shut down this number of seconds before the expiration time. This allows torc to send SIGTERM to all job processes and set all statuses to terminated. Increase the time in cases where the job processes handle SIGTERM and need more time to gracefully shut down. 
Set the value to 0 to maximize the time given to jobs. If not set, take the database's default value of 90 seconds.") + project: Optional[StrictStr] = Field(default=None, description="Project name or identifier for grouping workflows") + metadata: Optional[StrictStr] = Field(default=None, description="Arbitrary metadata as JSON string") + compute_node_expiration_buffer_seconds: Optional[StrictInt] = Field(default=180, description="Inform all compute nodes to shut down this number of seconds before the expiration time. This allows torc to send SIGTERM to all job processes and set all statuses to terminated. Increase the time in cases where the job processes handle SIGTERM and need more time to gracefully shut down. Set the value to 0 to maximize the time given to jobs. If not set, take the database's default value of 90 seconds.") compute_node_wait_for_new_jobs_seconds: Optional[StrictInt] = Field(default=90, description="Inform all compute nodes to wait for new jobs for this time period before exiting. Does not apply if the workflow is complete. Default must be >= completion_check_interval_secs + job_completion_poll_interval to avoid exiting before dependent jobs are unblocked.") compute_node_ignore_workflow_completion: Optional[StrictBool] = Field(default=False, description="Inform all compute nodes to ignore workflow completions and hold onto allocations indefinitely. 
Useful for debugging failed jobs and possibly dynamic workflows where jobs get added after starting.") compute_node_wait_for_healthy_database_minutes: Optional[StrictInt] = Field(default=20, description="Inform all compute nodes to wait this number of minutes if the database becomes unresponsive.") @@ -42,7 +44,7 @@ class WorkflowModel(BaseModel): slurm_defaults: Optional[StrictStr] = Field(default=None, description="Default Slurm parameters to apply to all schedulers as JSON string") use_pending_failed: Optional[StrictBool] = Field(default=False, description="Use PendingFailed status for failed jobs (enables AI-assisted recovery)") status_id: Optional[StrictInt] = None - __properties: ClassVar[List[str]] = ["id", "name", "user", "description", "timestamp", "compute_node_expiration_buffer_seconds", "compute_node_wait_for_new_jobs_seconds", "compute_node_ignore_workflow_completion", "compute_node_wait_for_healthy_database_minutes", "compute_node_min_time_for_new_jobs_seconds", "jobs_sort_method", "resource_monitor_config", "slurm_defaults", "use_pending_failed", "status_id"] + __properties: ClassVar[List[str]] = ["id", "name", "user", "description", "timestamp", "project", "metadata", "compute_node_expiration_buffer_seconds", "compute_node_wait_for_new_jobs_seconds", "compute_node_ignore_workflow_completion", "compute_node_wait_for_healthy_database_minutes", "compute_node_min_time_for_new_jobs_seconds", "jobs_sort_method", "resource_monitor_config", "slurm_defaults", "use_pending_failed", "status_id"] model_config = ConfigDict( populate_by_name=True, @@ -100,7 +102,9 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: "user": obj.get("user"), "description": obj.get("description"), "timestamp": obj.get("timestamp"), - "compute_node_expiration_buffer_seconds": obj.get("compute_node_expiration_buffer_seconds") if obj.get("compute_node_expiration_buffer_seconds") is not None else 60, + "project": obj.get("project"), + "metadata": obj.get("metadata"), 
+ "compute_node_expiration_buffer_seconds": obj.get("compute_node_expiration_buffer_seconds") if obj.get("compute_node_expiration_buffer_seconds") is not None else 180, "compute_node_wait_for_new_jobs_seconds": obj.get("compute_node_wait_for_new_jobs_seconds") if obj.get("compute_node_wait_for_new_jobs_seconds") is not None else 90, "compute_node_ignore_workflow_completion": obj.get("compute_node_ignore_workflow_completion") if obj.get("compute_node_ignore_workflow_completion") is not None else False, "compute_node_wait_for_healthy_database_minutes": obj.get("compute_node_wait_for_healthy_database_minutes") if obj.get("compute_node_wait_for_healthy_database_minutes") is not None else 20, diff --git a/src/client.rs b/src/client.rs index e45c43ec..63e99bd1 100644 --- a/src/client.rs +++ b/src/client.rs @@ -13,6 +13,7 @@ pub mod apis; pub mod async_cli_command; pub mod commands; pub mod errors; +pub mod resource_correction; // Re-export config from the top-level module for backwards compatibility #[cfg(feature = "config")] @@ -50,7 +51,7 @@ pub use workflow_spec::{ // Report model types for inter-command data sharing pub use report_models::{ - FailedJobInfo, JobResultRecord, ResourceUtilizationReport, ResourceViolation, ResultsReport, + JobResultRecord, ResourceUtilizationReport, ResourceViolation, ResultsReport, }; // Version checking utilities diff --git a/src/client/commands/recover.rs b/src/client/commands/recover.rs index 5d098d9e..e0030f55 100644 --- a/src/client/commands/recover.rs +++ b/src/client/commands/recover.rs @@ -14,8 +14,8 @@ use crate::client::apis::configuration::Configuration; use crate::client::apis::default_api; use crate::client::commands::slurm::RegenerateDryRunResult; use crate::client::report_models::{ResourceUtilizationReport, ResultsReport}; +use crate::client::resource_correction::{ResourceAdjustmentReport, apply_resource_corrections}; use crate::models::JobStatus; -use crate::time_utils::duration_string_to_seconds; /// Arguments for 
workflow recovery pub struct RecoverArgs { @@ -49,36 +49,6 @@ pub struct RecoveryResult { pub slurm_dry_run: Option, } -/// Detailed report of a resource adjustment for JSON output -#[derive(Debug, Clone, Serialize)] -pub struct ResourceAdjustmentReport { - /// The resource_requirements_id being adjusted - pub resource_requirements_id: i64, - /// Job IDs that share this resource requirement - pub job_ids: Vec, - /// Job names for reference - pub job_names: Vec, - /// Whether memory was adjusted - pub memory_adjusted: bool, - /// Original memory setting - #[serde(skip_serializing_if = "Option::is_none")] - pub original_memory: Option, - /// New memory setting - #[serde(skip_serializing_if = "Option::is_none")] - pub new_memory: Option, - /// Maximum peak memory observed (bytes) - #[serde(skip_serializing_if = "Option::is_none")] - pub max_peak_memory_bytes: Option, - /// Whether runtime was adjusted - pub runtime_adjusted: bool, - /// Original runtime setting - #[serde(skip_serializing_if = "Option::is_none")] - pub original_runtime: Option, - /// New runtime setting - #[serde(skip_serializing_if = "Option::is_none")] - pub new_runtime: Option, -} - /// Full recovery report for JSON output #[derive(Debug, Clone, Serialize)] pub struct RecoveryReport { @@ -100,56 +70,6 @@ pub struct SlurmLogInfo { pub slurm_stderr: Option, } -/// Parse memory string (e.g., "8g", "512m", "1024k") to bytes -pub fn parse_memory_bytes(mem: &str) -> Option { - let mem = mem.trim().to_lowercase(); - let (num_str, multiplier) = if mem.ends_with("gb") { - (mem.trim_end_matches("gb"), 1024u64 * 1024 * 1024) - } else if mem.ends_with("g") { - (mem.trim_end_matches("g"), 1024u64 * 1024 * 1024) - } else if mem.ends_with("mb") { - (mem.trim_end_matches("mb"), 1024u64 * 1024) - } else if mem.ends_with("m") { - (mem.trim_end_matches("m"), 1024u64 * 1024) - } else if mem.ends_with("kb") { - (mem.trim_end_matches("kb"), 1024u64) - } else if mem.ends_with("k") { - (mem.trim_end_matches("k"), 1024u64) 
- } else { - (mem.as_str(), 1u64) - }; - num_str - .parse::() - .ok() - .map(|n| (n * multiplier as f64) as u64) -} - -/// Format bytes to memory string (e.g., "12g", "512m") -pub fn format_memory_bytes_short(bytes: u64) -> String { - if bytes >= 1024 * 1024 * 1024 { - format!("{}g", bytes / (1024 * 1024 * 1024)) - } else if bytes >= 1024 * 1024 { - format!("{}m", bytes / (1024 * 1024)) - } else if bytes >= 1024 { - format!("{}k", bytes / 1024) - } else { - format!("{}b", bytes) - } -} - -/// Format seconds to ISO8601 duration (e.g., "PT2H30M") -pub fn format_duration_iso8601(secs: u64) -> String { - let hours = secs / 3600; - let mins = (secs % 3600) / 60; - if hours > 0 && mins > 0 { - format!("PT{}H{}M", hours, mins) - } else if hours > 0 { - format!("PT{}H", hours) - } else { - format!("PT{}M", mins.max(1)) - } -} - /// Recover a Slurm workflow by: /// 1. Cleaning up orphaned jobs (from terminated Slurm allocations) /// 2. Checking preconditions (workflow complete, no active workers) @@ -790,48 +710,23 @@ fn correlate_slurm_logs( } } - // Filter to only failed jobs + // Filter to only resource violations let mut failed_log_map = HashMap::new(); - for failed_job in &diagnosis.failed_jobs { - if let Some(log_info) = log_map.remove(&failed_job.job_id) { - failed_log_map.insert(failed_job.job_id, log_info); + for violation in &diagnosis.resource_violations { + if let Some(log_info) = log_map.remove(&violation.job_id) { + failed_log_map.insert(violation.job_id, log_info); } } failed_log_map } -/// Aggregated resource adjustment data for a single resource_requirements_id. -/// When multiple jobs share the same resource requirements, we take the maximum -/// peak memory and runtime to ensure all jobs can succeed on retry. 
-#[derive(Debug)] -struct ResourceAdjustment { - /// The resource_requirements_id - rr_id: i64, - /// Job IDs that share this resource requirement and need retry - job_ids: Vec, - /// Job names for logging - job_names: Vec, - /// Maximum peak memory observed across all OOM jobs (in bytes) - max_peak_memory_bytes: Option, - /// Whether any job had OOM without peak data (fall back to multiplier) - has_oom_without_peak: bool, - /// Whether any job had a timeout - has_timeout: bool, - /// Current memory setting (for fallback calculation) - current_memory: String, - /// Current runtime setting - current_runtime: String, -} - /// Apply recovery heuristics and update job resources /// /// If `dry_run` is true, shows what would be done without making changes. /// -/// When multiple jobs share the same `resource_requirements_id`, this function -/// finds the maximum peak memory across all OOM jobs in that group and applies -/// that (with multiplier) to the shared resource requirement. This ensures all -/// jobs in the group can succeed on retry. +/// This function combines recovery-specific logic (Slurm logs, retry_unknown handling) +/// with the shared resource correction algorithm. 
#[allow(clippy::too_many_arguments)] pub fn apply_recovery_heuristics( config: &Configuration, @@ -843,12 +738,7 @@ pub fn apply_recovery_heuristics( output_dir: &Path, dry_run: bool, ) -> Result { - let mut oom_fixed = 0; - let mut timeout_fixed = 0; - let mut other_failures = 0; - let mut jobs_to_retry = Vec::new(); - - // Try to get Slurm log info for correlation + // Try to get Slurm log info for correlation and logging let slurm_log_map = match get_slurm_log_info(workflow_id, output_dir) { Ok(slurm_info) => { let log_map = correlate_slurm_logs(diagnosis, &slurm_info); @@ -863,262 +753,60 @@ pub fn apply_recovery_heuristics( } }; - // Phase 1: Collect and aggregate data by resource_requirements_id - // This ensures that when multiple jobs share the same RR, we use the - // maximum peak memory across all of them. - let mut rr_adjustments: HashMap = HashMap::new(); - - for job_info in &diagnosis.failed_jobs { - let job_id = job_info.job_id; - let likely_oom = job_info.likely_oom; - let likely_timeout = job_info.likely_timeout; - - // Log Slurm info if available - if let Some(slurm_info) = slurm_log_map.get(&job_id) + // Log Slurm info for each resource violation if available + for violation in &diagnosis.resource_violations { + if let Some(slurm_info) = slurm_log_map.get(&violation.job_id) && let Some(slurm_job_id) = &slurm_info.slurm_job_id { - debug!(" Job {} ran in Slurm allocation {}", job_id, slurm_job_id); + debug!( + " Job {} ran in Slurm allocation {}", + violation.job_id, slurm_job_id + ); } + } + + // Count other failures for recovery report + let mut other_failures = 0; + let mut unknown_job_ids = Vec::new(); - // Handle unknown failures (no OOM or timeout detected) - if !likely_oom && !likely_timeout { + for violation in &diagnosis.resource_violations { + if !violation.likely_oom && !violation.likely_timeout { other_failures += 1; if retry_unknown { - jobs_to_retry.push(job_id); + unknown_job_ids.push(violation.job_id); } - continue; - } - - // Get 
current job to find resource requirements - let job = match default_api::get_job(config, job_id) { - Ok(j) => j, - Err(e) => { - warn!(" Warning: couldn't get job {}: {}", job_id, e); - continue; - } - }; - - let rr_id = match job.resource_requirements_id { - Some(id) => id, - None => { - warn!(" Warning: job {} has no resource requirements", job_id); - other_failures += 1; - continue; - } - }; - - // Get or create the adjustment entry for this resource_requirements_id - let adjustment = rr_adjustments.entry(rr_id).or_insert_with(|| { - // Fetch current resource requirements (only once per rr_id) - let (current_memory, current_runtime) = - match default_api::get_resource_requirements(config, rr_id) { - Ok(rr) => (rr.memory, rr.runtime), - Err(e) => { - warn!( - " Warning: couldn't get resource requirements {}: {}", - rr_id, e - ); - (String::new(), String::new()) - } - }; - ResourceAdjustment { - rr_id, - job_ids: Vec::new(), - job_names: Vec::new(), - max_peak_memory_bytes: None, - has_oom_without_peak: false, - has_timeout: false, - current_memory, - current_runtime, - } - }); - - // Skip if we couldn't fetch the resource requirements - if adjustment.current_memory.is_empty() { - continue; - } - - adjustment.job_ids.push(job_id); - adjustment.job_names.push(job.name.clone()); - - // Track OOM data - if likely_oom { - let peak_bytes = job_info - .peak_memory_bytes - .filter(|&v| v > 0) - .map(|v| v as u64); - - if let Some(peak) = peak_bytes { - // Update max if this job used more memory - adjustment.max_peak_memory_bytes = Some( - adjustment - .max_peak_memory_bytes - .map_or(peak, |current_max| current_max.max(peak)), - ); - } else { - adjustment.has_oom_without_peak = true; - } - } - - // Track timeout - if likely_timeout { - adjustment.has_timeout = true; } } - // Phase 2: Apply adjustments once per resource_requirements_id - let mut adjustment_reports = Vec::new(); - - for adjustment in rr_adjustments.values() { - let rr_id = adjustment.rr_id; - let mut 
updated = false; - let mut memory_adjusted = false; - let mut runtime_adjusted = false; - let mut original_memory = None; - let mut new_memory_str = None; - let mut original_runtime = None; - let mut new_runtime_str = None; - - // Fetch current resource requirements for update - let rr = match default_api::get_resource_requirements(config, rr_id) { - Ok(r) => r, - Err(e) => { - warn!( - " Warning: couldn't get resource requirements {}: {}", - rr_id, e - ); - continue; - } - }; - let mut new_rr = rr.clone(); - - // Apply OOM fix using maximum peak memory across all jobs sharing this RR - if adjustment.max_peak_memory_bytes.is_some() || adjustment.has_oom_without_peak { - let new_bytes = if let Some(max_peak) = adjustment.max_peak_memory_bytes { - // Use the maximum observed peak memory * multiplier - (max_peak as f64 * memory_multiplier) as u64 - } else if let Some(current_bytes) = parse_memory_bytes(&adjustment.current_memory) { - // Fall back to current specified * multiplier - (current_bytes as f64 * memory_multiplier) as u64 - } else { - warn!( - " RR {}: OOM detected but couldn't determine new memory", - rr_id - ); - continue; - }; - - let new_memory = format_memory_bytes_short(new_bytes); - let job_count = adjustment.job_ids.len(); - - if let Some(max_peak) = adjustment.max_peak_memory_bytes { - if job_count > 1 { - info!( - " {} job(s) with RR {}: OOM detected, max peak usage {} -> allocating {} ({}x)", - job_count, - rr_id, - format_memory_bytes_short(max_peak), - new_memory, - memory_multiplier - ); - debug!(" Jobs: {:?}", adjustment.job_names); - } else if let (Some(job_id), Some(job_name)) = - (adjustment.job_ids.first(), adjustment.job_names.first()) - { - info!( - " Job {} ({}): OOM detected, peak usage {} -> allocating {} ({}x)", - job_id, - job_name, - format_memory_bytes_short(max_peak), - new_memory, - memory_multiplier - ); - } - } else { - info!( - " {} job(s) with RR {}: OOM detected, increasing memory {} -> {} ({}x, no peak data)", - job_count, 
rr_id, adjustment.current_memory, new_memory, memory_multiplier - ); - } - - // Track for JSON report - original_memory = Some(adjustment.current_memory.clone()); - new_memory_str = Some(new_memory.clone()); - memory_adjusted = true; - - new_rr.memory = new_memory; - updated = true; - oom_fixed += adjustment.job_ids.len(); - } - - // Apply timeout fix - if adjustment.has_timeout - && let Ok(current_secs) = duration_string_to_seconds(&adjustment.current_runtime) - { - let new_secs = (current_secs as f64 * runtime_multiplier) as u64; - let new_runtime = format_duration_iso8601(new_secs); - let job_count = adjustment.job_ids.len(); - - if job_count > 1 { - info!( - " {} job(s) with RR {}: Timeout detected, increasing runtime {} -> {}", - job_count, rr_id, adjustment.current_runtime, new_runtime - ); - } else if let (Some(job_id), Some(job_name)) = - (adjustment.job_ids.first(), adjustment.job_names.first()) - { - info!( - " Job {} ({}): Timeout detected, increasing runtime {} -> {}", - job_id, job_name, adjustment.current_runtime, new_runtime - ); - } - - // Track for JSON report - original_runtime = Some(adjustment.current_runtime.clone()); - new_runtime_str = Some(new_runtime.clone()); - runtime_adjusted = true; + // Call shared resource correction algorithm + let correction_result = apply_resource_corrections( + config, + workflow_id, + diagnosis, + memory_multiplier, + runtime_multiplier, + &[], // include_jobs empty means all jobs + dry_run, + )?; - new_rr.runtime = new_runtime; - updated = true; - timeout_fixed += adjustment.job_ids.len(); - } + // Extract counts from shared result + let oom_fixed = correction_result.memory_corrections; + let timeout_fixed = correction_result.runtime_corrections; - // Update resource requirements if changed (only once per rr_id) - if updated { - if !dry_run - && let Err(e) = default_api::update_resource_requirements(config, rr_id, new_rr) - { - warn!( - " Warning: failed to update resource requirements {}: {}", - rr_id, e - ); - 
} - // All jobs sharing this RR should be retried - jobs_to_retry.extend(&adjustment.job_ids); - - // Create adjustment report for JSON output - adjustment_reports.push(ResourceAdjustmentReport { - resource_requirements_id: rr_id, - job_ids: adjustment.job_ids.clone(), - job_names: adjustment.job_names.clone(), - memory_adjusted, - original_memory, - new_memory: new_memory_str, - max_peak_memory_bytes: adjustment.max_peak_memory_bytes, - runtime_adjusted, - original_runtime, - new_runtime: new_runtime_str, - }); - } + // Combine jobs that need retry: those with corrected resources + unknown failures + let mut jobs_to_retry = Vec::new(); + for adj in &correction_result.adjustments { + jobs_to_retry.extend(&adj.job_ids); } + jobs_to_retry.extend(&unknown_job_ids); Ok(RecoveryResult { oom_fixed, timeout_fixed, - unknown_retried: 0, // Will be set in recover_workflow if retry_unknown is true + unknown_retried: unknown_job_ids.len(), other_failures, jobs_to_retry, - adjustments: adjustment_reports, + adjustments: correction_result.adjustments, slurm_dry_run: None, // Set in recover_workflow dry_run block }) } @@ -1310,38 +998,3 @@ fn get_scheduler_dry_run( serde_json::from_str(&stdout) .map_err(|e| format!("Failed to parse slurm regenerate dry-run output: {}", e)) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_memory_bytes() { - assert_eq!(parse_memory_bytes("1g"), Some(1024 * 1024 * 1024)); - assert_eq!(parse_memory_bytes("2gb"), Some(2 * 1024 * 1024 * 1024)); - assert_eq!(parse_memory_bytes("512m"), Some(512 * 1024 * 1024)); - assert_eq!(parse_memory_bytes("256mb"), Some(256 * 1024 * 1024)); - assert_eq!(parse_memory_bytes("1024k"), Some(1024 * 1024)); - assert_eq!(parse_memory_bytes("1024kb"), Some(1024 * 1024)); - assert_eq!(parse_memory_bytes("1024"), Some(1024)); - } - - #[test] - fn test_format_memory_bytes_short() { - assert_eq!(format_memory_bytes_short(1024 * 1024 * 1024), "1g"); - assert_eq!(format_memory_bytes_short(2 * 1024 * 
1024 * 1024), "2g"); - assert_eq!(format_memory_bytes_short(512 * 1024 * 1024), "512m"); - assert_eq!(format_memory_bytes_short(1024 * 1024), "1m"); - assert_eq!(format_memory_bytes_short(1024), "1k"); - } - - #[test] - fn test_format_duration_iso8601() { - assert_eq!(format_duration_iso8601(3600), "PT1H"); - assert_eq!(format_duration_iso8601(7200), "PT2H"); - assert_eq!(format_duration_iso8601(5400), "PT1H30M"); - assert_eq!(format_duration_iso8601(1800), "PT30M"); - assert_eq!(format_duration_iso8601(60), "PT1M"); - assert_eq!(format_duration_iso8601(30), "PT1M"); // Minimum 1 minute - } -} diff --git a/src/client/commands/reports.rs b/src/client/commands/reports.rs index 71407c9c..17194f4c 100644 --- a/src/client/commands/reports.rs +++ b/src/client/commands/reports.rs @@ -10,7 +10,8 @@ use crate::client::log_paths::{ get_slurm_job_runner_log_file, get_slurm_stderr_path, get_slurm_stdout_path, }; use crate::client::report_models::{ - FailedJobInfo, JobResultRecord, ResourceUtilizationReport, ResourceViolation, ResultsReport, + JobResultRecord, ResourceUtilizationReport, ResourceViolation, ResourceViolationInfo, + ResultsReport, }; use crate::models; use crate::time_utils::duration_string_to_seconds; @@ -92,6 +93,9 @@ EXAMPLES: /// Include failed and terminated jobs in the analysis (for recovery diagnostics) #[arg(long)] include_failed: bool, + /// Minimum over-utilization percentage to flag as violation (default: 1.0%) + #[arg(long, default_value = "1.0")] + min_over_utilization: f64, }, /// Generate a comprehensive JSON report of job results including all log file paths #[command(after_long_help = "\ @@ -135,6 +139,7 @@ pub fn handle_report_commands(config: &Configuration, command: &ReportCommands, run_id, all, include_failed, + min_over_utilization, } => { check_resource_utilization( config, @@ -142,6 +147,7 @@ pub fn handle_report_commands(config: &Configuration, command: &ReportCommands, *run_id, *all, *include_failed, + *min_over_utilization, format, ); } 
@@ -165,6 +171,7 @@ fn check_resource_utilization( run_id: Option, show_all: bool, include_failed: bool, + min_over_utilization: f64, format: &str, ) { // Get or select workflow ID @@ -282,7 +289,7 @@ fn check_resource_utilization( // Analyze each result let mut rows = Vec::new(); let mut over_util_count = 0; - let mut failed_jobs_info: Vec = Vec::new(); + let mut resource_violations_info: Vec = Vec::new(); for result in &results { let job_id = result.job_id; @@ -329,6 +336,9 @@ fn check_resource_utilization( let mut likely_timeout = false; let mut timeout_reason: Option = None; let mut runtime_utilization: Option = None; + let mut likely_cpu_violation = false; + let mut peak_cpu_percent_val: Option = None; + let mut likely_runtime_violation = false; let peak_memory_bytes = result.peak_memory_bytes; let peak_memory_formatted = peak_memory_bytes.map(format_memory_bytes); @@ -365,6 +375,11 @@ fn check_resource_utilization( likely_timeout = true; timeout_reason = Some("sigxcpu_152".to_string()); } + + // Check for runtime violation: job ran longer than allocated time + if exec_time_seconds > specified_runtime_seconds { + likely_runtime_violation = true; + } } // Check for OOM via return code 137 (128 + SIGKILL) @@ -392,7 +407,18 @@ fn check_resource_utilization( } } - failed_jobs_info.push(FailedJobInfo { + // Check CPU over-utilization + if let Some(peak_cpu_percent) = result.peak_cpu_percent { + peak_cpu_percent_val = Some(peak_cpu_percent); + let num_cpus = resource_req.num_cpus; + let specified_cpu_percent = 100.0 * num_cpus as f64; // 100% per CPU + + if peak_cpu_percent > specified_cpu_percent { + likely_cpu_violation = true; + } + } + + resource_violations_info.push(ResourceViolationInfo { job_id, job_name: job_name.clone(), return_code: result.return_code, @@ -408,6 +434,9 @@ fn check_resource_utilization( likely_timeout, timeout_reason, runtime_utilization, + likely_cpu_violation, + peak_cpu_percent: peak_cpu_percent_val, + likely_runtime_violation, }); } @@ 
-415,17 +444,19 @@ fn check_resource_utilization( if let Some(peak_memory_bytes) = result.peak_memory_bytes { let specified_memory_bytes = parse_memory_string(&resource_req.memory); if peak_memory_bytes > specified_memory_bytes { - over_util_count += 1; let over_pct = ((peak_memory_bytes as f64 / specified_memory_bytes as f64) - 1.0) * 100.0; - rows.push(ResourceUtilizationRow { - job_id, - job_name: job_name.clone(), - resource_type: "Memory".to_string(), - specified: format_memory_bytes(specified_memory_bytes), - peak_used: format_memory_bytes(peak_memory_bytes), - over_utilization: format!("+{:.1}%", over_pct), - }); + if over_pct >= min_over_utilization { + over_util_count += 1; + rows.push(ResourceUtilizationRow { + job_id, + job_name: job_name.clone(), + resource_type: "Memory".to_string(), + specified: format_memory_bytes(specified_memory_bytes), + peak_used: format_memory_bytes(peak_memory_bytes), + over_utilization: format!("+{:.1}%", over_pct), + }); + } } else if show_all { let under_pct = (1.0 - (peak_memory_bytes as f64 / specified_memory_bytes as f64)) * 100.0; @@ -447,16 +478,18 @@ fn check_resource_utilization( let specified_cpu_percent = 100.0 * num_cpus as f64; // 100% per CPU if peak_cpu_percent > specified_cpu_percent { - over_util_count += 1; let over_pct = ((peak_cpu_percent / specified_cpu_percent) - 1.0) * 100.0; - rows.push(ResourceUtilizationRow { - job_id, - job_name: job_name.clone(), - resource_type: "CPU".to_string(), - specified: format!("{:.0}% ({} cores)", specified_cpu_percent, num_cpus), - peak_used: format!("{:.1}%", peak_cpu_percent), - over_utilization: format!("+{:.1}%", over_pct), - }); + if over_pct >= min_over_utilization { + over_util_count += 1; + rows.push(ResourceUtilizationRow { + job_id, + job_name: job_name.clone(), + resource_type: "CPU".to_string(), + specified: format!("{:.0}% ({} cores)", specified_cpu_percent, num_cpus), + peak_used: format!("{:.1}%", peak_cpu_percent), + over_utilization: format!("+{:.1}%", 
over_pct), + }); + } } else if show_all { let under_pct = (1.0 - (peak_cpu_percent / specified_cpu_percent)) * 100.0; rows.push(ResourceUtilizationRow { @@ -481,16 +514,18 @@ fn check_resource_utilization( }; if exec_time_seconds > specified_runtime_seconds { - over_util_count += 1; let over_pct = ((exec_time_seconds / specified_runtime_seconds) - 1.0) * 100.0; - rows.push(ResourceUtilizationRow { - job_id, - job_name: job_name.clone(), - resource_type: "Runtime".to_string(), - specified: format_duration(specified_runtime_seconds), - peak_used: format_duration(exec_time_seconds), - over_utilization: format!("+{:.1}%", over_pct), - }); + if over_pct >= min_over_utilization { + over_util_count += 1; + rows.push(ResourceUtilizationRow { + job_id, + job_name: job_name.clone(), + resource_type: "Runtime".to_string(), + specified: format_duration(specified_runtime_seconds), + peak_used: format_duration(exec_time_seconds), + over_utilization: format!("+{:.1}%", over_pct), + }); + } } else if show_all { let under_pct = (1.0 - (exec_time_seconds / specified_runtime_seconds)) * 100.0; rows.push(ResourceUtilizationRow { @@ -523,8 +558,8 @@ fn check_resource_utilization( over_utilization: r.over_utilization.clone(), }) .collect(), - failed_jobs_count: failed_jobs_info.len(), - failed_jobs: failed_jobs_info, + resource_violations_count: resource_violations_info.len(), + resource_violations: resource_violations_info, }; print_json(&report, "resource utilization"); @@ -551,6 +586,12 @@ fn check_resource_utilization( } display_table_with_count(&rows, "violations"); + // Print command to run correct-resources + eprintln!( + "\nTo automatically correct these violations, run:\n torc workflows correct-resources {}", + wf_id + ); + if !show_all { println!( "\nNote: Use --all to see all jobs, including those that stayed within limits" diff --git a/src/client/commands/watch.rs b/src/client/commands/watch.rs index 72a82ece..41b38dc5 100644 --- a/src/client/commands/watch.rs +++ 
b/src/client/commands/watch.rs @@ -781,8 +781,8 @@ pub fn run_watch(config: &Configuration, args: &WatchArgs) { total_results: 0, over_utilization_count: 0, violations: Vec::new(), - failed_jobs_count: 0, - failed_jobs: Vec::new(), + resource_violations_count: 0, + resource_violations: Vec::new(), } } }; diff --git a/src/client/commands/workflows.rs b/src/client/commands/workflows.rs index 0017151d..231e1fdc 100644 --- a/src/client/commands/workflows.rs +++ b/src/client/commands/workflows.rs @@ -60,15 +60,17 @@ use crate::client::commands::workflow_export::{ EXPORT_VERSION, ExportImportStats, IdMappings, WorkflowExport, }; use crate::client::commands::{ - get_env_user_name, print_error, select_workflow_interactively, + get_env_user_name, output::print_json_wrapped, print_error, select_workflow_interactively, table_format::display_table_with_count, }; use crate::client::hpc::hpc_interface::HpcInterface; use crate::client::workflow_manager::WorkflowManager; use crate::client::workflow_spec::WorkflowSpec; use crate::config::TorcConfig; +use crate::memory_utils::memory_string_to_bytes; use crate::models; use crate::models::JobStatus; +use crate::time_utils::duration_string_to_seconds; use serde_json; use tabled::Tabled; @@ -80,6 +82,10 @@ struct WorkflowTableRowNoUser { name: String, #[tabled(rename = "Description")] description: String, + #[tabled(rename = "Project")] + project: String, + #[tabled(rename = "Metadata")] + metadata: String, #[tabled(rename = "Timestamp")] timestamp: String, } @@ -94,6 +100,10 @@ struct WorkflowTableRow { name: String, #[tabled(rename = "Description")] description: String, + #[tabled(rename = "Project")] + project: String, + #[tabled(rename = "Metadata")] + metadata: String, #[tabled(rename = "Timestamp")] timestamp: String, } @@ -317,8 +327,7 @@ EXAMPLES: /// Update an existing workflow #[command( hide = true, - after_long_help = "\ -EXAMPLES: + after_long_help = r#"EXAMPLES: # Update workflow name torc workflows update 123 --name 'New 
Name' @@ -327,7 +336,13 @@ EXAMPLES: # Transfer ownership torc workflows update 123 --owner-user newuser -" + + # Update project + torc workflows update 123 --project my-project + + # Update metadata (pass JSON as string; use single quotes in shell) + torc workflows update 123 --metadata '{"key":"value","stage":"production"}' +"# )] Update { /// ID of the workflow to update (optional - will prompt if not provided) @@ -342,6 +357,12 @@ EXAMPLES: /// User that owns the workflow #[arg(long)] owner_user: Option, + /// Project name or identifier + #[arg(long)] + project: Option, + /// Metadata as JSON string + #[arg(long)] + metadata: Option, }, /// Cancel a workflow and all associated Slurm jobs. All state will be preserved and the /// workflow can be resumed after it is reinitialized. @@ -561,6 +582,48 @@ EXAMPLES: #[arg(long)] no_prompts: bool, }, + /// Correct resource requirements based on actual job usage (proactive optimization) + /// + /// Analyzes completed jobs and adjusts resource requirements to better match actual usage. + /// Unlike `torc recover`, this command does NOT reset or rerun jobs - it only updates + /// resource requirements for future runs. 
+ #[command( + name = "correct-resources", + after_long_help = "\ +EXAMPLES: + # Preview corrections (dry-run) + torc workflows correct-resources 123 --dry-run + + # Apply corrections to all over-utilized jobs + torc workflows correct-resources 123 + + # Apply corrections only to specific jobs + torc workflows correct-resources 123 --job-ids 45,67,89 + + # Use custom multipliers + torc workflows correct-resources 123 --memory-multiplier 1.5 --runtime-multiplier 1.4 + + # Output as JSON for programmatic use + torc -f json workflows correct-resources 123 --dry-run +" + )] + CorrectResources { + /// ID of the workflow to analyze (optional - will prompt if not provided) + #[arg()] + workflow_id: Option, + /// Memory multiplier for jobs that exceeded memory (default: 1.2) + #[arg(long, default_value = "1.2")] + memory_multiplier: f64, + /// Runtime multiplier for jobs that exceeded runtime (default: 1.2) + #[arg(long, default_value = "1.2")] + runtime_multiplier: f64, + /// Only correct resource requirements for specific jobs (comma-separated IDs) + #[arg(long, value_delimiter = ',')] + job_ids: Option>, + /// Show what would be changed without applying (default: false) + #[arg(long)] + dry_run: bool, + }, /// Show the execution plan for a workflow specification or existing workflow #[command( hide = true, @@ -738,6 +801,36 @@ EXAMPLES: }, } +/// Parse JSON string fields into objects for better JSON output formatting +/// +/// Converts JSON string fields (resource_monitor_config, slurm_defaults, metadata) +/// from string representations into actual JSON objects in the output. +/// This improves readability in JSON output while keeping them as strings in the database. 
+fn parse_json_fields(mut json: serde_json::Value) -> serde_json::Value { + // Parse resource_monitor_config if present + if let Some(config_str) = json["resource_monitor_config"].as_str() + && let Ok(config_obj) = serde_json::from_str::(config_str) + { + json["resource_monitor_config"] = config_obj; + } + + // Parse slurm_defaults if present + if let Some(defaults_str) = json["slurm_defaults"].as_str() + && let Ok(defaults_obj) = serde_json::from_str::(defaults_str) + { + json["slurm_defaults"] = defaults_obj; + } + + // Parse metadata if present + if let Some(metadata_str) = json["metadata"].as_str() + && let Ok(metadata_obj) = serde_json::from_str::(metadata_str) + { + json["metadata"] = metadata_obj; + } + + json +} + fn show_execution_plan_from_spec(file_path: &str, format: &str) { // Parse the workflow spec let mut spec = match WorkflowSpec::from_spec_file(file_path) { @@ -1068,6 +1161,378 @@ fn handle_list_actions( } } +/// Context for looking up jobs and their resource requirements +/// +/// This struct centralizes job and resource requirement lookups to reduce +/// repeated nested if-let chains throughout violation detection. 
+struct ResourceLookupContext<'a> { + jobs: &'a [models::JobModel], + resource_requirements: &'a [models::ResourceRequirementsModel], +} + +impl<'a> ResourceLookupContext<'a> { + fn new( + jobs: &'a [models::JobModel], + resource_requirements: &'a [models::ResourceRequirementsModel], + ) -> Self { + Self { + jobs, + resource_requirements, + } + } + + fn find_job(&self, job_id: i64) -> Option<&models::JobModel> { + self.jobs.iter().find(|j| j.id == Some(job_id)) + } + + fn find_resource_requirements(&self, rr_id: i64) -> Option<&models::ResourceRequirementsModel> { + self.resource_requirements + .iter() + .find(|r| r.id == Some(rr_id)) + } +} + +/// Detect memory violation based on actual peak memory usage +fn detect_memory_violation( + ctx: &ResourceLookupContext, + result: &models::ResultModel, + job: &models::JobModel, +) -> bool { + if let Some(peak_mem) = result.peak_memory_bytes + && let Some(rr_id) = job.resource_requirements_id + && let Some(rr) = ctx.find_resource_requirements(rr_id) + && let Ok(specified_memory_bytes) = memory_string_to_bytes(&rr.memory) + { + peak_mem > specified_memory_bytes + } else { + false + } +} + +/// Detect CPU violation based on actual peak CPU percentage +fn detect_cpu_violation( + ctx: &ResourceLookupContext, + result: &models::ResultModel, + job: &models::JobModel, +) -> bool { + if let Some(peak_cpu) = result.peak_cpu_percent + && let Some(rr_id) = job.resource_requirements_id + && let Some(rr) = ctx.find_resource_requirements(rr_id) + { + let configured_cpus = rr.num_cpus as f64; + let specified_cpu_percent = configured_cpus * 100.0; + peak_cpu > specified_cpu_percent + } else { + false + } +} + +/// Detect runtime violation based on actual execution time +fn detect_runtime_violation( + ctx: &ResourceLookupContext, + result: &models::ResultModel, + job: &models::JobModel, +) -> bool { + if let Some(rr_id) = job.resource_requirements_id + && let Some(rr) = ctx.find_resource_requirements(rr_id) + && let 
Ok(specified_runtime_seconds) = duration_string_to_seconds(&rr.runtime) + { + let exec_time_seconds = result.exec_time_minutes * 60.0; + let specified_runtime_seconds = specified_runtime_seconds as f64; + exec_time_seconds > specified_runtime_seconds + } else { + false + } +} + +/// Detect timeout based on return code +fn detect_timeout(result: &models::ResultModel) -> bool { + result.return_code == 152 +} + +fn handle_correct_resources( + config: &Configuration, + workflow_id: &Option, + memory_multiplier: f64, + runtime_multiplier: f64, + job_ids: &Option>, + dry_run: bool, + format: &str, +) { + use crate::client::report_models::ResourceUtilizationReport; + use crate::client::resource_correction::apply_resource_corrections; + + let user_name = get_env_user_name(); + + let selected_workflow_id = match workflow_id { + Some(id) => *id, + None => select_workflow_interactively(config, &user_name).unwrap(), + }; + + if format != "json" { + if dry_run { + eprintln!( + "Analyzing and correcting resource requirements for workflow {} (dry-run mode)", + selected_workflow_id + ); + } else { + eprintln!( + "Analyzing and correcting resource requirements for workflow {}", + selected_workflow_id + ); + } + } + + // Step 1: Fetch completed and failed results for diagnosis (uses latest run) + let params = ResultListParams::new().with_status(models::JobStatus::Completed); + let completed_results = match paginate_results(config, selected_workflow_id, params) { + Ok(results) => results, + Err(e) => { + if format == "json" { + let error_response = serde_json::json!({ + "status": "error", + "message": format!("Failed to fetch completed results: {}", e), + "workflow_id": selected_workflow_id + }); + println!("{}", serde_json::to_string_pretty(&error_response).unwrap()); + } else { + eprintln!("Error: Failed to fetch completed results: {}", e); + } + std::process::exit(1); + } + }; + + // Fetch failed results to analyze resource issues (uses latest run) + let failed_params = 
ResultListParams::new().with_status(models::JobStatus::Failed); + let failed_results = + paginate_results(config, selected_workflow_id, failed_params).unwrap_or_default(); + + let mut all_results = completed_results; + all_results.extend(failed_results); + + if all_results.is_empty() { + if format == "json" { + let response = serde_json::json!({ + "status": "success", + "workflow_id": selected_workflow_id, + "resource_requirements_updated": 0, + "jobs_analyzed": 0, + "message": "No completed jobs found" + }); + println!("{}", serde_json::to_string_pretty(&response).unwrap()); + } else { + println!( + "No completed jobs found for workflow {}", + selected_workflow_id + ); + } + return; + } + + // Step 2: Get jobs and resource requirements to build failed_jobs list + let jobs = match paginate_jobs(config, selected_workflow_id, JobListParams::new()) { + Ok(j) => j, + Err(e) => { + if format == "json" { + let error_response = serde_json::json!({ + "status": "error", + "message": format!("Failed to fetch jobs: {}", e), + "workflow_id": selected_workflow_id + }); + println!("{}", serde_json::to_string_pretty(&error_response).unwrap()); + } else { + eprintln!("Error: Failed to fetch jobs: {}", e); + } + std::process::exit(1); + } + }; + + // Fetch resource requirements to check CPU allocations + let resource_requirements = paginate_resource_requirements( + config, + selected_workflow_id, + ResourceRequirementsListParams::new(), + ) + .unwrap_or_default(); + + // Build resource_violations list with violation detection + let ctx = ResourceLookupContext::new(&jobs, &resource_requirements); + let mut resource_violations = Vec::new(); + + for result in &all_results { + if let Some(job) = ctx.find_job(result.job_id) { + let likely_oom = detect_memory_violation(&ctx, result, job); + let likely_timeout = detect_timeout(result); + let likely_cpu_violation = detect_cpu_violation(&ctx, result, job); + let likely_runtime_violation = detect_runtime_violation(&ctx, result, job); + + if 
likely_oom || likely_timeout || likely_cpu_violation || likely_runtime_violation { + let peak_memory_bytes = result.peak_memory_bytes; + let (configured_cpus, configured_memory, configured_runtime) = + if let Some(rr_id) = job.resource_requirements_id { + if let Some(rr) = ctx.find_resource_requirements(rr_id) { + (rr.num_cpus, rr.memory.clone(), rr.runtime.clone()) + } else { + (0, String::new(), String::new()) + } + } else { + (0, String::new(), String::new()) + }; + + let violation_info = crate::client::report_models::ResourceViolationInfo { + job_id: result.job_id, + job_name: job.name.clone(), + return_code: result.return_code, + exec_time_minutes: result.exec_time_minutes, + configured_memory, + configured_runtime, + configured_cpus, + peak_memory_bytes, + peak_memory_formatted: None, + likely_oom, + oom_reason: if likely_oom { + // Distinguish between actual OOM failure (137) vs memory violation in successful job + if result.return_code == 137 { + Some("sigkill_137".to_string()) + } else { + Some("memory_exceeded".to_string()) + } + } else { + None + }, + memory_over_utilization: None, + likely_timeout, + timeout_reason: if likely_timeout { + Some("sigxcpu_152".to_string()) + } else { + None + }, + runtime_utilization: None, + likely_cpu_violation, + peak_cpu_percent: result.peak_cpu_percent, + likely_runtime_violation, + }; + resource_violations.push(violation_info); + } + } + } + + if resource_violations.is_empty() { + if format == "json" { + let response = serde_json::json!({ + "status": "success", + "workflow_id": selected_workflow_id, + "resource_requirements_updated": 0, + "jobs_analyzed": all_results.len(), + "message": "No jobs with over-utilization detected" + }); + println!("{}", serde_json::to_string_pretty(&response).unwrap()); + } else { + println!("No jobs with over-utilization (OOM/timeout/CPU) detected"); + } + return; + } + + // Create diagnosis report + let diagnosis = ResourceUtilizationReport { + workflow_id: selected_workflow_id, + 
run_id: None, + total_results: all_results.len(), + over_utilization_count: resource_violations.len(), + violations: Vec::new(), + resource_violations_count: resource_violations.len(), + resource_violations: resource_violations.clone(), + }; + + // Step 3: Apply resource corrections + let include_job_list = job_ids.as_deref().unwrap_or(&[]); + + match apply_resource_corrections( + config, + selected_workflow_id, + &diagnosis, + memory_multiplier, + runtime_multiplier, + include_job_list, + dry_run, + ) { + Ok(result) => { + if format == "json" { + let response = serde_json::json!({ + "status": "success", + "workflow_id": selected_workflow_id, + "dry_run": dry_run, + "memory_multiplier": memory_multiplier, + "runtime_multiplier": runtime_multiplier, + "resource_requirements_updated": result.resource_requirements_updated, + "jobs_analyzed": result.jobs_analyzed, + "memory_corrections": result.memory_corrections, + "runtime_corrections": result.runtime_corrections, + "adjustments": result.adjustments + }); + println!("{}", serde_json::to_string_pretty(&response).unwrap()); + } else { + // Print summary + println!(); + println!("Resource Correction Summary:"); + println!(" Workflow: {}", selected_workflow_id); + println!(" Jobs analyzed: {}", result.jobs_analyzed); + println!( + " Resource requirements updated: {}", + result.resource_requirements_updated + ); + println!(" Memory corrections: {}", result.memory_corrections); + println!(" Runtime corrections: {}", result.runtime_corrections); + + // Print details if any corrections were made + if !result.adjustments.is_empty() { + println!(); + println!("Adjustment Details:"); + for adj in &result.adjustments { + println!( + " RR {}: {} job(s)", + adj.resource_requirements_id, + adj.job_ids.len() + ); + if let (Some(old_mem), Some(new_mem)) = + (&adj.original_memory, &adj.new_memory) + { + println!(" Memory: {} -> {}", old_mem, new_mem); + } + if let (Some(old_rt), Some(new_rt)) = + (&adj.original_runtime, 
&adj.new_runtime) + { + println!(" Runtime: {} -> {}", old_rt, new_rt); + } + } + } + + if dry_run { + println!(); + println!("(dry-run mode - changes not applied)"); + } else { + println!(); + println!("Resource requirements updated successfully"); + } + } + } + Err(e) => { + if format == "json" { + let error_response = serde_json::json!({ + "status": "error", + "message": format!("Failed to apply resource corrections: {}", e), + "workflow_id": selected_workflow_id + }); + println!("{}", serde_json::to_string_pretty(&error_response).unwrap()); + } else { + eprintln!("Error: Failed to apply resource corrections: {}", e); + } + std::process::exit(1); + } + } +} + +#[allow(clippy::too_many_arguments)] fn handle_cancel(config: &Configuration, workflow_id: &Option, format: &str) { let user_name = get_env_user_name(); @@ -2066,14 +2531,8 @@ fn handle_delete(config: &Configuration, ids: &[i64], no_prompts: bool, format: let json_array: Vec<_> = deleted_workflows .iter() .map(|wf| { - let mut json = serde_json::to_value(wf).unwrap(); - // Parse resource_monitor_config from JSON string to object if present - if let Some(config_str) = &wf.resource_monitor_config - && let Ok(config_obj) = serde_json::from_str::(config_str) - { - json["resource_monitor_config"] = config_obj; - } - json + let json = serde_json::to_value(wf).unwrap(); + parse_json_fields(json) }) .collect(); @@ -2115,12 +2574,18 @@ fn handle_delete(config: &Configuration, ids: &[i64], no_prompts: bool, format: } } +struct WorkflowUpdateFields { + name: Option, + description: Option, + owner_user: Option, + project: Option, + metadata: Option, +} + fn handle_update( config: &Configuration, id: &Option, - name: &Option, - description: &Option, - owner_user: &Option, + updates: &WorkflowUpdateFields, format: &str, ) { let user_name = get_env_user_name(); @@ -2133,29 +2598,28 @@ fn handle_update( match default_api::get_workflow(config, selected_id) { Ok(mut workflow) => { // Update fields that were provided - if 
let Some(new_name) = name { + if let Some(new_name) = &updates.name { workflow.name = new_name.clone(); } - if description.is_some() { - workflow.description = description.clone(); + if updates.description.is_some() { + workflow.description = updates.description.clone(); } - if let Some(new_user) = owner_user { + if let Some(new_user) = &updates.owner_user { workflow.user = new_user.clone(); } + if updates.project.is_some() { + workflow.project = updates.project.clone(); + } + if updates.metadata.is_some() { + workflow.metadata = updates.metadata.clone(); + } match default_api::update_workflow(config, selected_id, workflow) { Ok(updated_workflow) => { if format == "json" { - // Convert workflow to JSON value, parsing resource_monitor_config if present - let mut json = serde_json::to_value(&updated_workflow).unwrap(); - - // Parse resource_monitor_config from JSON string to object if present - if let Some(config_str) = &updated_workflow.resource_monitor_config - && let Ok(config_obj) = - serde_json::from_str::(config_str) - { - json["resource_monitor_config"] = config_obj; - } + // Convert workflow to JSON value, parsing JSON string fields to objects + let json = serde_json::to_value(&updated_workflow).unwrap(); + let json = parse_json_fields(json); match serde_json::to_string_pretty(&json) { Ok(json_str) => println!("{}", json_str), @@ -2197,22 +2661,8 @@ fn handle_get(config: &Configuration, id: &Option, user: &str, format: &str Ok(workflow) => { if format == "json" { // Convert workflow to JSON value, parsing JSON string fields to objects - let mut json = serde_json::to_value(&workflow).unwrap(); - - // Parse resource_monitor_config from JSON string to object if present - if let Some(config_str) = &workflow.resource_monitor_config - && let Ok(config_obj) = serde_json::from_str::(config_str) - { - json["resource_monitor_config"] = config_obj; - } - - // Parse slurm_defaults from JSON string to object if present - if let Some(defaults_str) = 
&workflow.slurm_defaults - && let Ok(defaults_obj) = - serde_json::from_str::(defaults_str) - { - json["slurm_defaults"] = defaults_obj; - } + let json = serde_json::to_value(&workflow).unwrap(); + let json = parse_json_fields(json); match serde_json::to_string_pretty(&json) { Ok(json_str) => println!("{}", json_str), @@ -2311,35 +2761,12 @@ fn handle_list( let workflows_json: Vec = workflows .iter() .map(|workflow| { - let mut json = serde_json::to_value(workflow).unwrap(); - - // Parse resource_monitor_config from JSON string to object if present - if let Some(config_str) = &workflow.resource_monitor_config - && let Ok(config_obj) = - serde_json::from_str::(config_str) - { - json["resource_monitor_config"] = config_obj; - } - - // Parse slurm_defaults from JSON string to object if present - if let Some(defaults_str) = &workflow.slurm_defaults - && let Ok(defaults_obj) = - serde_json::from_str::(defaults_str) - { - json["slurm_defaults"] = defaults_obj; - } - - json + let json = serde_json::to_value(workflow).unwrap(); + parse_json_fields(json) }) .collect(); - match serde_json::to_string_pretty(&workflows_json) { - Ok(json) => println!("{}", json), - Err(e) => { - eprintln!("Error serializing workflows to JSON: {}", e); - std::process::exit(1); - } - } + print_json_wrapped("workflows", &workflows_json, "workflows"); } else if workflows.is_empty() { if all_users { println!("No workflows found."); @@ -2355,6 +2782,8 @@ fn handle_list( user: workflow.user.clone(), name: workflow.name.clone(), description: workflow.description.as_deref().unwrap_or("").to_string(), + project: workflow.project.as_deref().unwrap_or("").to_string(), + metadata: workflow.metadata.as_deref().unwrap_or("").to_string(), timestamp: workflow.timestamp.as_deref().unwrap_or("").to_string(), }) .collect(); @@ -2367,6 +2796,8 @@ fn handle_list( id: workflow.id.unwrap_or(-1), name: workflow.name.clone(), description: workflow.description.as_deref().unwrap_or("").to_string(), + project: 
workflow.project.as_deref().unwrap_or("").to_string(), + metadata: workflow.metadata.as_deref().unwrap_or("").to_string(), timestamp: workflow.timestamp.as_deref().unwrap_or("").to_string(), }) .collect(); @@ -2393,15 +2824,9 @@ fn handle_new( match default_api::create_workflow(config, workflow) { Ok(created_workflow) => { if format == "json" { - // Convert workflow to JSON value, parsing resource_monitor_config if present - let mut json = serde_json::to_value(&created_workflow).unwrap(); - - // Parse resource_monitor_config from JSON string to object if present - if let Some(config_str) = &created_workflow.resource_monitor_config - && let Ok(config_obj) = serde_json::from_str::(config_str) - { - json["resource_monitor_config"] = config_obj; - } + // Convert workflow to JSON value, parsing JSON string fields to objects + let json = serde_json::to_value(&created_workflow).unwrap(); + let json = parse_json_fields(json); match serde_json::to_string_pretty(&json) { Ok(json_str) => println!("{}", json_str), @@ -2979,8 +3404,17 @@ pub fn handle_workflow_commands(config: &Configuration, command: &WorkflowComman name, description, owner_user, + project, + metadata, } => { - handle_update(config, id, name, description, owner_user, format); + let updates = WorkflowUpdateFields { + name: name.clone(), + description: description.clone(), + owner_user: owner_user.clone(), + project: project.clone(), + metadata: metadata.clone(), + }; + handle_update(config, id, &updates, format); } WorkflowCommands::Delete { ids, no_prompts } => { handle_delete(config, ids, *no_prompts, format); @@ -3043,6 +3477,23 @@ pub fn handle_workflow_commands(config: &Configuration, command: &WorkflowComman format, ); } + WorkflowCommands::CorrectResources { + workflow_id, + memory_multiplier, + runtime_multiplier, + job_ids, + dry_run, + } => { + handle_correct_resources( + config, + workflow_id, + *memory_multiplier, + *runtime_multiplier, + job_ids, + *dry_run, + format, + ); + } 
WorkflowCommands::Cancel { workflow_id } => { handle_cancel(config, workflow_id, format); } diff --git a/src/client/job_runner.rs b/src/client/job_runner.rs index 975d122d..1258ced9 100644 --- a/src/client/job_runner.rs +++ b/src/client/job_runner.rs @@ -73,7 +73,7 @@ use std::time::{Duration, Instant}; use crate::client::apis::configuration::Configuration; use crate::client::apis::default_api; use crate::client::async_cli_command::AsyncCliCommand; -use crate::client::commands::recover::format_duration_iso8601; +use crate::client::resource_correction::format_duration_iso8601; use crate::client::resource_monitor::{ResourceMonitor, ResourceMonitorConfig}; use crate::client::utils; use crate::config::TorcConfig; diff --git a/src/client/report_models.rs b/src/client/report_models.rs index 0916be96..d27f4ea0 100644 --- a/src/client/report_models.rs +++ b/src/client/report_models.rs @@ -15,12 +15,12 @@ pub struct ResourceUtilizationReport { pub total_results: usize, pub over_utilization_count: usize, pub violations: Vec, - /// Number of failed jobs (only present when `--include-failed` is used) + /// Number of jobs with resource violations (only present when `--include-failed` is used) #[serde(default, skip_serializing_if = "is_zero")] - pub failed_jobs_count: usize, - /// Failed job information (only present when `--include-failed` is used) + pub resource_violations_count: usize, + /// Jobs that exceeded resource allocations (only present when `--include-failed` is used) #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub failed_jobs: Vec, + pub resource_violations: Vec, } fn is_zero(n: &usize) -> bool { @@ -38,9 +38,13 @@ pub struct ResourceViolation { pub over_utilization: String, } -/// Information about a failed job for recovery diagnostics +/// Information about a job that exceeded resource allocation. +/// +/// This includes jobs that failed during execution and completed jobs +/// that exceeded their configured memory, CPU, or runtime limits. 
+/// Used for proactive resource optimization and recovery diagnostics. #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FailedJobInfo { +pub struct ResourceViolationInfo { pub job_id: i64, pub job_name: String, pub return_code: i64, @@ -80,6 +84,18 @@ pub struct FailedJobInfo { /// Runtime utilization percentage (e.g., "95.2%") #[serde(default, skip_serializing_if = "Option::is_none")] pub runtime_utilization: Option, + + /// Whether this job exceeded its CPU allocation + #[serde(default, skip_serializing_if = "is_false")] + pub likely_cpu_violation: bool, + + /// Peak CPU percentage used (e.g., 501.4%) + #[serde(default, skip_serializing_if = "Option::is_none")] + pub peak_cpu_percent: Option, + + /// Whether this job exceeded its runtime allocation + #[serde(default, skip_serializing_if = "is_false")] + pub likely_runtime_violation: bool, } fn is_false(b: &bool) -> bool { @@ -160,8 +176,8 @@ mod tests { peak_used: "1.5 GB".to_string(), over_utilization: "+50.0%".to_string(), }], - failed_jobs_count: 1, - failed_jobs: vec![FailedJobInfo { + resource_violations_count: 1, + resource_violations: vec![ResourceViolationInfo { job_id: 2, job_name: "failed_job".to_string(), return_code: 137i64, @@ -177,6 +193,9 @@ mod tests { likely_timeout: false, timeout_reason: None, runtime_utilization: Some("9.2%".to_string()), + likely_cpu_violation: false, + peak_cpu_percent: None, + likely_runtime_violation: false, }], }; @@ -184,8 +203,8 @@ mod tests { let parsed: ResourceUtilizationReport = serde_json::from_str(&json).unwrap(); assert_eq!(parsed.workflow_id, 1); - assert_eq!(parsed.failed_jobs.len(), 1); - assert!(parsed.failed_jobs[0].likely_oom); + assert_eq!(parsed.resource_violations.len(), 1); + assert!(parsed.resource_violations[0].likely_oom); } #[test] @@ -224,7 +243,7 @@ mod tests { } #[test] - fn test_failed_job_info_optional_fields() { + fn test_resource_violation_info_optional_fields() { // Test that optional fields can be omitted // Note: return_code 
and configured_cpus are i64 in the struct let json = r#"{ @@ -237,10 +256,11 @@ mod tests { "configured_cpus": 2 }"#; - let parsed: FailedJobInfo = serde_json::from_str(json).unwrap(); + let parsed: ResourceViolationInfo = serde_json::from_str(json).unwrap(); assert_eq!(parsed.job_id, 1); assert!(!parsed.likely_oom); assert!(!parsed.likely_timeout); + assert!(!parsed.likely_runtime_violation); assert!(parsed.peak_memory_bytes.is_none()); } } diff --git a/src/client/resource_correction.rs b/src/client/resource_correction.rs new file mode 100644 index 00000000..d5b56649 --- /dev/null +++ b/src/client/resource_correction.rs @@ -0,0 +1,622 @@ +//! Shared resource correction logic for both recovery and proactive optimization. +//! +//! This module provides the core algorithms for analyzing resource utilization +//! and automatically adjusting resource requirements based on actual job usage. +//! It's used by both `torc recover` (reactive) and `torc workflows correct-resources` (proactive). + +use std::collections::HashMap; + +use log::{debug, info, warn}; +use serde::Serialize; + +use crate::client::apis::configuration::Configuration; +use crate::client::apis::default_api; +use crate::client::report_models::ResourceUtilizationReport; +use crate::memory_utils::memory_string_to_bytes; +use crate::time_utils::duration_string_to_seconds; + +/// Result of applying resource corrections +#[derive(Debug, Clone, Serialize)] +pub struct ResourceCorrectionResult { + pub resource_requirements_updated: usize, + pub jobs_analyzed: usize, + pub memory_corrections: usize, + pub runtime_corrections: usize, + /// Detailed adjustment reports for JSON output + #[serde(skip_serializing_if = "Vec::is_empty")] + pub adjustments: Vec, +} + +/// Detailed report of a resource adjustment for JSON output +#[derive(Debug, Clone, Serialize)] +pub struct ResourceAdjustmentReport { + /// The resource_requirements_id being adjusted + pub resource_requirements_id: i64, + /// Job IDs that share this 
resource requirement + pub job_ids: Vec, + /// Job names for reference + pub job_names: Vec, + /// Whether memory was adjusted + pub memory_adjusted: bool, + /// Original memory setting + #[serde(skip_serializing_if = "Option::is_none")] + pub original_memory: Option, + /// New memory setting + #[serde(skip_serializing_if = "Option::is_none")] + pub new_memory: Option, + /// Maximum peak memory observed (bytes) + #[serde(skip_serializing_if = "Option::is_none")] + pub max_peak_memory_bytes: Option, + /// Whether runtime was adjusted + pub runtime_adjusted: bool, + /// Original runtime setting + #[serde(skip_serializing_if = "Option::is_none")] + pub original_runtime: Option, + /// New runtime setting + #[serde(skip_serializing_if = "Option::is_none")] + pub new_runtime: Option, + /// Whether CPU was adjusted + #[serde(default, skip_serializing_if = "is_false")] + pub cpu_adjusted: bool, + /// Original CPU count + #[serde(skip_serializing_if = "Option::is_none")] + pub original_cpus: Option, + /// New CPU count + #[serde(skip_serializing_if = "Option::is_none")] + pub new_cpus: Option, + /// Maximum peak CPU percentage observed + #[serde(skip_serializing_if = "Option::is_none")] + pub max_peak_cpu_percent: Option, +} + +fn is_false(b: &bool) -> bool { + !*b +} + +/// Aggregated resource adjustment data for a single resource_requirements_id. +/// When multiple jobs share the same resource requirements, we take the maximum +/// peak memory and runtime to ensure all jobs can succeed on retry. 
+#[derive(Debug)] +struct ResourceAdjustment { + /// The resource_requirements_id + rr_id: i64, + /// Job IDs that share this resource requirement + job_ids: Vec, + /// Job names for logging + job_names: Vec, + /// Maximum peak memory observed across all OOM jobs (in bytes) + max_peak_memory_bytes: Option, + /// Whether any job had OOM without peak data (fall back to multiplier) + has_oom_without_peak: bool, + /// Whether any job had a timeout + has_timeout: bool, + /// Current memory setting (for fallback calculation) + current_memory: String, + /// Current runtime setting + current_runtime: String, + /// Maximum peak CPU percentage observed across all CPU violation jobs + max_peak_cpu_percent: Option, + /// Whether any job had a CPU violation + has_cpu_violation: bool, + /// Current CPU count (for CPU violation calculation) + current_cpus: i64, + /// Maximum peak runtime observed across all runtime violation jobs (in minutes) + max_peak_runtime_minutes: Option, + /// Whether any job had a runtime violation + has_runtime_violation: bool, +} + +/// Format bytes to memory string (e.g., "12g", "512m") +/// Uses ceiling division to ensure sufficient memory allocation +pub fn format_memory_bytes_short(bytes: u64) -> String { + const GB: u64 = 1024 * 1024 * 1024; + const MB: u64 = 1024 * 1024; + const KB: u64 = 1024; + + if bytes >= GB { + format!("{}g", bytes.div_ceil(GB)) + } else if bytes >= MB { + format!("{}m", bytes.div_ceil(MB)) + } else if bytes >= KB { + format!("{}k", bytes.div_ceil(KB)) + } else { + format!("{}b", bytes) + } +} + +/// Format seconds to ISO8601 duration (e.g., "PT2H30M") +pub fn format_duration_iso8601(secs: u64) -> String { + let hours = secs / 3600; + let mins = (secs % 3600) / 60; + if hours > 0 && mins > 0 { + format!("PT{}H{}M", hours, mins) + } else if hours > 0 { + format!("PT{}H", hours) + } else { + format!("PT{}M", mins.max(1)) + } +} + +/// Apply resource corrections based on utilization analysis +/// +/// This function analyzes job 
resource utilization data and adjusts resource +/// requirements (memory and runtime) to better match actual job needs. +/// +/// When multiple jobs share the same `resource_requirements_id`, this function +/// finds the maximum peak memory across all jobs in that group and applies +/// that (with multiplier) to the shared resource requirement. This ensures all +/// jobs in the group can succeed. +/// +/// # Arguments +/// +/// * `config` - API configuration +/// * `workflow_id` - The workflow to correct resources for +/// * `diagnosis` - Resource utilization analysis (from `check-resource-utilization`) +/// * `memory_multiplier` - Factor to multiply peak memory by (e.g., 1.2 for 20% buffer) +/// * `runtime_multiplier` - Factor to multiply runtime by (e.g., 1.2 for 20% buffer) +/// * `include_jobs` - If non-empty, only correct these specific job IDs +/// * `dry_run` - If true, show what would be done without applying changes +/// +/// # Returns +/// +/// Result containing the correction result with counts and detailed adjustments +pub fn apply_resource_corrections( + config: &Configuration, + _workflow_id: i64, + diagnosis: &ResourceUtilizationReport, + memory_multiplier: f64, + runtime_multiplier: f64, + include_jobs: &[i64], + dry_run: bool, +) -> Result { + let mut memory_corrections = 0; + let mut runtime_corrections = 0; + + // Phase 1: Collect and aggregate data by resource_requirements_id + // This ensures that when multiple jobs share the same RR, we use the + // maximum peak memory across all of them. 
+ let mut rr_adjustments: HashMap = HashMap::new(); + + let jobs_to_analyze = if include_jobs.is_empty() { + diagnosis.resource_violations.clone() + } else { + diagnosis + .resource_violations + .iter() + .filter(|j| include_jobs.contains(&j.job_id)) + .cloned() + .collect() + }; + + let jobs_analyzed = jobs_to_analyze.len(); + + for job_info in &jobs_to_analyze { + let job_id = job_info.job_id; + let likely_oom = job_info.likely_oom; + let likely_timeout = job_info.likely_timeout; + let likely_cpu_violation = job_info.likely_cpu_violation; + let likely_runtime_violation = job_info.likely_runtime_violation; + + // Skip if no violations detected + if !likely_oom && !likely_timeout && !likely_cpu_violation && !likely_runtime_violation { + continue; + } + + // Get current job to find resource requirements + let job = match default_api::get_job(config, job_id) { + Ok(j) => j, + Err(e) => { + warn!("Warning: couldn't get job {}: {}", job_id, e); + continue; + } + }; + + let rr_id = match job.resource_requirements_id { + Some(id) => id, + None => { + warn!("Warning: job {} has no resource requirements", job_id); + continue; + } + }; + + // Get or create the adjustment entry for this resource_requirements_id + let adjustment = rr_adjustments.entry(rr_id).or_insert_with(|| { + // Fetch current resource requirements (only once per rr_id) + let (current_memory, current_runtime, current_cpus) = + match default_api::get_resource_requirements(config, rr_id) { + Ok(rr) => (rr.memory, rr.runtime, rr.num_cpus), + Err(e) => { + warn!( + "Warning: couldn't get resource requirements {}: {}", + rr_id, e + ); + (String::new(), String::new(), 0) + } + }; + ResourceAdjustment { + rr_id, + job_ids: Vec::new(), + job_names: Vec::new(), + max_peak_memory_bytes: None, + has_oom_without_peak: false, + has_timeout: false, + current_memory, + current_runtime, + max_peak_cpu_percent: None, + has_cpu_violation: false, + current_cpus, + max_peak_runtime_minutes: None, + has_runtime_violation: 
false, + } + }); + + // Skip if we couldn't fetch the resource requirements + if adjustment.current_memory.is_empty() { + continue; + } + + adjustment.job_ids.push(job_id); + adjustment.job_names.push(job.name.clone()); + + // Track OOM data + if likely_oom { + let peak_bytes = job_info + .peak_memory_bytes + .filter(|&v| v > 0) + .map(|v| v as u64); + + if let Some(peak) = peak_bytes { + // Update max if this job used more memory + adjustment.max_peak_memory_bytes = Some( + adjustment + .max_peak_memory_bytes + .map_or(peak, |current_max| current_max.max(peak)), + ); + } else { + adjustment.has_oom_without_peak = true; + } + } + + // Track timeout + if likely_timeout { + adjustment.has_timeout = true; + } + + // Track CPU violation + if likely_cpu_violation && let Some(peak_cpu) = job_info.peak_cpu_percent { + adjustment.has_cpu_violation = true; + adjustment.max_peak_cpu_percent = Some( + adjustment + .max_peak_cpu_percent + .map_or(peak_cpu, |current_max| current_max.max(peak_cpu)), + ); + } + + // Track runtime violation + if likely_runtime_violation { + adjustment.has_runtime_violation = true; + adjustment.max_peak_runtime_minutes = Some( + adjustment + .max_peak_runtime_minutes + .map_or(job_info.exec_time_minutes, |current_max| { + current_max.max(job_info.exec_time_minutes) + }), + ); + } + } + + // Phase 2: Apply adjustments once per resource_requirements_id + let mut adjustment_reports = Vec::new(); + + for adjustment in rr_adjustments.values() { + let rr_id = adjustment.rr_id; + let mut updated = false; + let mut memory_adjusted = false; + let mut runtime_adjusted = false; + let mut cpu_adjusted = false; + let mut original_memory = None; + let mut new_memory_str = None; + let mut original_runtime = None; + let mut new_runtime_str = None; + let mut original_cpus = None; + let mut new_cpus_value = None; + + // Fetch current resource requirements for update + let rr = match default_api::get_resource_requirements(config, rr_id) { + Ok(r) => r, + Err(e) => { 
+ warn!( + "Warning: couldn't get resource requirements {}: {}", + rr_id, e + ); + continue; + } + }; + let mut new_rr = rr.clone(); + + // Apply OOM fix using maximum peak memory across all jobs sharing this RR + if adjustment.max_peak_memory_bytes.is_some() || adjustment.has_oom_without_peak { + let new_bytes = if let Some(max_peak) = adjustment.max_peak_memory_bytes { + // Use the maximum observed peak memory * multiplier + (max_peak as f64 * memory_multiplier) as u64 + } else if let Ok(current_bytes) = memory_string_to_bytes(&adjustment.current_memory) { + // Fall back to current specified * multiplier + (current_bytes as f64 * memory_multiplier) as u64 + } else { + warn!( + "RR {}: OOM detected but couldn't determine new memory", + rr_id + ); + continue; + }; + + let new_memory = format_memory_bytes_short(new_bytes); + let job_count = adjustment.job_ids.len(); + + if let Some(max_peak) = adjustment.max_peak_memory_bytes { + if job_count > 1 { + info!( + "{} job(s) with RR {}: OOM detected, max peak usage {} -> allocating {} ({}x)", + job_count, + rr_id, + format_memory_bytes_short(max_peak), + new_memory, + memory_multiplier + ); + debug!(" Jobs: {:?}", adjustment.job_names); + } else if let (Some(job_id), Some(job_name)) = + (adjustment.job_ids.first(), adjustment.job_names.first()) + { + info!( + "Job {} ({}): OOM detected, peak usage {} -> allocating {} ({}x)", + job_id, + job_name, + format_memory_bytes_short(max_peak), + new_memory, + memory_multiplier + ); + } + } else { + info!( + "{} job(s) with RR {}: OOM detected, increasing memory {} -> {} ({}x, no peak data)", + job_count, rr_id, adjustment.current_memory, new_memory, memory_multiplier + ); + } + + // Track for JSON report + original_memory = Some(adjustment.current_memory.clone()); + new_memory_str = Some(new_memory.clone()); + memory_adjusted = true; + + new_rr.memory = new_memory; + updated = true; + memory_corrections += adjustment.job_ids.len(); + } + + // Apply timeout fix + if 
adjustment.has_timeout + && let Ok(current_secs) = duration_string_to_seconds(&adjustment.current_runtime) + { + let new_secs = (current_secs as f64 * runtime_multiplier) as u64; + let new_runtime = format_duration_iso8601(new_secs); + let job_count = adjustment.job_ids.len(); + + if job_count > 1 { + info!( + "{} job(s) with RR {}: Timeout detected, increasing runtime {} -> {}", + job_count, rr_id, adjustment.current_runtime, new_runtime + ); + } else if let (Some(job_id), Some(job_name)) = + (adjustment.job_ids.first(), adjustment.job_names.first()) + { + info!( + "Job {} ({}): Timeout detected, increasing runtime {} -> {}", + job_id, job_name, adjustment.current_runtime, new_runtime + ); + } + + // Track for JSON report + original_runtime = Some(adjustment.current_runtime.clone()); + new_runtime_str = Some(new_runtime.clone()); + runtime_adjusted = true; + + new_rr.runtime = new_runtime; + updated = true; + runtime_corrections += adjustment.job_ids.len(); + } + + // Apply runtime violation fix + if adjustment.has_runtime_violation && !adjustment.has_timeout { + // Only apply if not already adjusted for timeout + if let (Some(max_peak_runtime), Ok(current_secs)) = ( + adjustment.max_peak_runtime_minutes, + duration_string_to_seconds(&adjustment.current_runtime), + ) { + let max_peak_secs = max_peak_runtime as i64 * 60; + // Only update if the peak runtime exceeds current allocation + if max_peak_secs > current_secs { + let new_secs = (max_peak_secs as f64 * runtime_multiplier) as u64; + let new_runtime = format_duration_iso8601(new_secs); + let job_count = adjustment.job_ids.len(); + + if job_count > 1 { + info!( + "{} job(s) with RR {}: Runtime violation detected, peak {}m -> allocating {} ({}x)", + job_count, rr_id, max_peak_runtime, new_runtime, runtime_multiplier + ); + } else if let (Some(job_id), Some(job_name)) = + (adjustment.job_ids.first(), adjustment.job_names.first()) + { + info!( + "Job {} ({}): Runtime violation detected, peak {}m -> allocating {} 
({}x)", + job_id, job_name, max_peak_runtime, new_runtime, runtime_multiplier + ); + } + + // Track for JSON report + original_runtime = Some(adjustment.current_runtime.clone()); + new_runtime_str = Some(new_runtime.clone()); + runtime_adjusted = true; + + new_rr.runtime = new_runtime; + updated = true; + runtime_corrections += adjustment.job_ids.len(); + } + } + } + + // Apply CPU violation fix + // Note: CPU corrections use memory_multiplier (capacity multiplier) like memory corrections, + // not runtime_multiplier (time multiplier). This ensures consistent safety margins for resources. + if adjustment.has_cpu_violation + && let Some(max_peak_cpu) = adjustment.max_peak_cpu_percent + { + // peak_cpu_percent is the total percentage for all CPUs + // e.g., 501.4% with 3 CPUs allocated (300%) + // Calculate required CPUs using capacity multiplier for safety margin + let required_cpus = (max_peak_cpu / 100.0 * memory_multiplier).ceil() as i64; + let new_cpus = std::cmp::max(required_cpus, 1); // At least 1 CPU + + if new_cpus > adjustment.current_cpus { + let job_count = adjustment.job_ids.len(); + if job_count > 1 { + info!( + "{} job(s) with RR {}: CPU over-utilization detected, peak {}% -> allocating {} CPUs ({:.1}x safety margin)", + job_count, rr_id, max_peak_cpu, new_cpus, memory_multiplier + ); + } else if let (Some(job_id), Some(job_name)) = + (adjustment.job_ids.first(), adjustment.job_names.first()) + { + info!( + "Job {} ({}): CPU over-utilization detected, peak {}% -> allocating {} CPUs ({:.1}x safety margin)", + job_id, job_name, max_peak_cpu, new_cpus, memory_multiplier + ); + } + + // Track CPU adjustment for reporting + cpu_adjusted = true; + original_cpus = Some(adjustment.current_cpus); + new_cpus_value = Some(new_cpus); + + new_rr.num_cpus = new_cpus; + updated = true; + } + } + + // Update resource requirements if changed (only once per rr_id) + #[allow(clippy::collapsible_if)] + if updated { + if !dry_run { + if let Err(e) = 
default_api::update_resource_requirements(config, rr_id, new_rr) { + warn!( + "Warning: failed to update resource requirements {}: {}", + rr_id, e + ); + } + } + + // Create adjustment report for JSON output + adjustment_reports.push(ResourceAdjustmentReport { + resource_requirements_id: rr_id, + job_ids: adjustment.job_ids.clone(), + job_names: adjustment.job_names.clone(), + memory_adjusted, + original_memory, + new_memory: new_memory_str, + max_peak_memory_bytes: adjustment.max_peak_memory_bytes, + runtime_adjusted, + original_runtime, + new_runtime: new_runtime_str, + cpu_adjusted, + original_cpus, + new_cpus: new_cpus_value, + max_peak_cpu_percent: adjustment.max_peak_cpu_percent, + }); + } + } + + Ok(ResourceCorrectionResult { + resource_requirements_updated: adjustment_reports.len(), + jobs_analyzed, + memory_corrections, + runtime_corrections, + adjustments: adjustment_reports, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_memory_bytes_short_gigabytes() { + assert_eq!( + format_memory_bytes_short(8 * 1024 * 1024 * 1024), + "8g".to_string() + ); + } + + #[test] + fn test_format_memory_bytes_short_megabytes() { + assert_eq!( + format_memory_bytes_short(512 * 1024 * 1024), + "512m".to_string() + ); + } + + #[test] + fn test_format_memory_bytes_short_kilobytes() { + assert_eq!(format_memory_bytes_short(1024 * 1024), "1m".to_string()); + assert_eq!(format_memory_bytes_short(512 * 1024), "512k".to_string()); + } + + #[test] + fn test_format_memory_bytes_short_rounds_up() { + // Ensure ceiling division is used, not floor division + // 3.5GB should round up to 4g, not down to 3g + assert_eq!( + format_memory_bytes_short(3_500_000_000), + "4g".to_string(), + "3.5GB should round up to 4g" + ); + // 1.5MB should round up to 2m + assert_eq!( + format_memory_bytes_short(1_500_000), + "2m".to_string(), + "1.5MB should round up to 2m" + ); + } + + #[test] + fn test_format_memory_bytes_short_bytes() { + 
assert_eq!(format_memory_bytes_short(512), "512b".to_string()); + } + + #[test] + fn test_format_duration_iso8601_hours_and_minutes() { + assert_eq!(format_duration_iso8601(7200 + 1800), "PT2H30M".to_string()); + } + + #[test] + fn test_format_duration_iso8601_only_hours() { + assert_eq!(format_duration_iso8601(7200), "PT2H".to_string()); + } + + #[test] + fn test_format_duration_iso8601_only_minutes() { + assert_eq!(format_duration_iso8601(900), "PT15M".to_string()); + } + + #[test] + fn test_format_duration_iso8601_less_than_minute() { + assert_eq!(format_duration_iso8601(30), "PT1M".to_string()); + } + + #[test] + fn test_parse_format_memory_roundtrip() { + let original = "12g"; + let bytes = memory_string_to_bytes(original).unwrap() as u64; + let formatted = format_memory_bytes_short(bytes); + assert_eq!(formatted, original); + } +} diff --git a/src/client/workflow_spec.rs b/src/client/workflow_spec.rs index 1405ab2e..478c84b1 100644 --- a/src/client/workflow_spec.rs +++ b/src/client/workflow_spec.rs @@ -685,6 +685,12 @@ pub struct WorkflowSpec { /// Use PendingFailed status for failed jobs (enables AI-assisted recovery) #[serde(skip_serializing_if = "Option::is_none")] pub use_pending_failed: Option, + /// Project name or identifier for grouping workflows + #[serde(skip_serializing_if = "Option::is_none")] + pub project: Option, + /// Arbitrary metadata as JSON string + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option, } impl WorkflowSpec { @@ -716,6 +722,8 @@ impl WorkflowSpec { resource_monitor: None, actions: None, use_pending_failed: None, + project: None, + metadata: None, } } @@ -1698,6 +1706,16 @@ impl WorkflowSpec { workflow_model.use_pending_failed = Some(value); } + // Set project if present + if let Some(ref value) = spec.project { + workflow_model.project = Some(value.clone()); + } + + // Set metadata if present + if let Some(ref value) = spec.metadata { + workflow_model.metadata = Some(value.clone()); + } + let 
created_workflow = default_api::create_workflow(config, workflow_model) .map_err(|e| format!("Failed to create workflow: {:?}", e))?; @@ -4540,6 +4558,8 @@ job "train_lr{lr:.4f}_bs{batch_size}" { actions: None, failure_handlers: None, use_pending_failed: None, + project: None, + metadata: None, }; spec.expand_parameters() diff --git a/src/models.rs b/src/models.rs index b482b918..5068343c 100644 --- a/src/models.rs +++ b/src/models.rs @@ -9748,6 +9748,16 @@ pub struct WorkflowModel { #[serde(skip_serializing_if = "Option::is_none")] pub use_pending_failed: Option, + /// Project name or identifier for grouping workflows + #[serde(rename = "project")] + #[serde(skip_serializing_if = "Option::is_none")] + pub project: Option, + + /// Arbitrary metadata as JSON string + #[serde(rename = "metadata")] + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option, + #[serde(rename = "status_id")] #[serde(skip_serializing_if = "Option::is_none")] pub status_id: Option, @@ -9771,6 +9781,8 @@ impl WorkflowModel { resource_monitor_config: None, slurm_defaults: None, use_pending_failed: Some(false), + project: None, + metadata: None, status_id: None, } } @@ -9882,6 +9894,8 @@ impl std::str::FromStr for WorkflowModel { pub resource_monitor_config: Vec, pub slurm_defaults: Vec, pub use_pending_failed: Vec, + pub project: Vec, + pub metadata: Vec, pub status_id: Vec, } @@ -9955,6 +9969,12 @@ impl std::str::FromStr for WorkflowModel { "use_pending_failed" => intermediate_rep.use_pending_failed.push( ::from_str(val).map_err(|x| x.to_string())?, ), + "project" => intermediate_rep.project.push( + ::from_str(val).map_err(|x| x.to_string())?, + ), + "metadata" => intermediate_rep.metadata.push( + ::from_str(val).map_err(|x| x.to_string())?, + ), _ => { return std::result::Result::Err( "Unexpected key while parsing WorkflowModel".to_string(), @@ -10006,6 +10026,8 @@ impl std::str::FromStr for WorkflowModel { resource_monitor_config: 
intermediate_rep.resource_monitor_config.into_iter().next(), slurm_defaults: intermediate_rep.slurm_defaults.into_iter().next(), use_pending_failed: intermediate_rep.use_pending_failed.into_iter().next(), + project: intermediate_rep.project.into_iter().next(), + metadata: intermediate_rep.metadata.into_iter().next(), status_id: intermediate_rep.status_id.into_iter().next(), }) } diff --git a/src/server/api/workflows.rs b/src/server/api/workflows.rs index a5a8b12e..339cd396 100644 --- a/src/server/api/workflows.rs +++ b/src/server/api/workflows.rs @@ -202,6 +202,9 @@ impl WorkflowsApiImpl { ,w.jobs_sort_method ,w.resource_monitor_config ,w.slurm_defaults + ,w.use_pending_failed + ,w.project + ,w.metadata ,w.status_id FROM workflow w INNER JOIN workflow_status ws ON w.status_id = ws.id @@ -223,6 +226,9 @@ impl WorkflowsApiImpl { ,jobs_sort_method ,resource_monitor_config ,slurm_defaults + ,use_pending_failed + ,project + ,metadata ,status_id FROM workflow " @@ -391,6 +397,8 @@ impl WorkflowsApiImpl { .ok() .flatten() .map(|v| v != 0), + project: record.get("project"), + metadata: record.get("metadata"), status_id: Some(record.get("status_id")), }); } @@ -556,9 +564,11 @@ where resource_monitor_config, slurm_defaults, use_pending_failed, + project, + metadata, status_id ) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16) RETURNING rowid "#, body.name, @@ -574,6 +584,8 @@ where body.resource_monitor_config, body.slurm_defaults, use_pending_failed_int, + body.project, + body.metadata, status_result[0].id, ) .fetch_all(&mut *tx) @@ -819,6 +831,8 @@ where resource_monitor_config: row.resource_monitor_config, slurm_defaults: row.slurm_defaults, use_pending_failed: row.use_pending_failed.map(|v| v != 0), + project: row.project, + metadata: row.metadata, status_id: Some(row.status_id), }, )), @@ -1118,8 +1132,10 @@ where compute_node_ignore_workflow_completion = COALESCE($6, 
compute_node_ignore_workflow_completion), compute_node_wait_for_healthy_database_minutes = COALESCE($7, compute_node_wait_for_healthy_database_minutes), jobs_sort_method = COALESCE($8, jobs_sort_method), - use_pending_failed = COALESCE($9, use_pending_failed) - WHERE id = $10 + use_pending_failed = COALESCE($9, use_pending_failed), + project = COALESCE($10, project), + metadata = COALESCE($11, metadata) + WHERE id = $12 "#, body.name, body.description, @@ -1130,6 +1146,8 @@ where body.compute_node_wait_for_healthy_database_minutes, jobs_sort_method_str, use_pending_failed_int, + body.project, + body.metadata, id ) .execute(self.context.pool.as_ref()) diff --git a/tests/test_access_groups.rs b/tests/test_access_groups.rs index 067a1b4e..3a7693f9 100644 --- a/tests/test_access_groups.rs +++ b/tests/test_access_groups.rs @@ -1600,7 +1600,10 @@ fn test_workflows_list_all_users_with_access_control( let json_output: serde_json::Value = serde_json::from_str(&output).expect("Failed to parse JSON output"); - let workflows_array = json_output.as_array().expect("Expected JSON array"); + let workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); let found_ids: Vec = workflows_array .iter() diff --git a/tests/test_correct_resources.rs b/tests/test_correct_resources.rs new file mode 100644 index 00000000..2509d393 --- /dev/null +++ b/tests/test_correct_resources.rs @@ -0,0 +1,790 @@ +mod common; + +use common::{ + ServerProcess, create_test_compute_node, create_test_job, create_test_workflow, start_server, +}; +use rstest::rstest; +use torc::client::default_api; +use torc::client::workflow_manager::WorkflowManager; +use torc::config::TorcConfig; +use torc::models::{self, JobStatus}; + +/// Helper to create workflow manager and initialize workflow +fn create_and_initialize_workflow(config: &torc::client::Configuration, name: &str) -> (i64, i64) { + let workflow = create_test_workflow(config, name); 
+ let workflow_id = workflow.id.unwrap(); + + let torc_config = TorcConfig::load().unwrap_or_default(); + let manager = WorkflowManager::new(config.clone(), torc_config, workflow); + manager + .initialize(false) + .expect("Failed to initialize workflow"); + let run_id = manager.get_run_id().expect("Failed to get run_id"); + + (workflow_id, run_id) +} + +/// Test OOM violation detection in dry-run mode +#[rstest] +fn test_correct_resources_memory_violation_dry_run(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_memory_violation"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "memory_heavy_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement: 2GB memory + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "small".to_string()); + rr.memory = "2g".to_string(); + rr.runtime = "PT1H".to_string(); + rr.num_cpus = 1; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize to pick up the job + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job with OOM simulation + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = 
jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete with OOM: return code 137, quick execution, high memory peak + let mut job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + compute_node_id, + 137, // OOM signal + 0.5, // exec_time_minutes (< 1 minute) + chrono::Utc::now().to_rfc3339(), + JobStatus::Failed, + ); + + // Set peak memory to 3GB (exceeds 2GB limit) + job_result.peak_memory_bytes = Some(3_000_000_000); // 3GB + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Verify violation is recorded + let violations = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = violations.items.expect("Should have items"); + assert!(!items.is_empty(), "Should have results"); + let result = &items[0]; + assert_eq!(result.return_code, 137, "Should have OOM return code"); + assert_eq!( + result.peak_memory_bytes, + Some(3_000_000_000), + "Should have peak memory recorded" + ); +} + +/// Test CPU violation detection in dry-run mode +#[rstest] +fn test_correct_resources_cpu_violation_dry_run(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_cpu_violation"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "cpu_heavy_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement: 3 CPUs + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "medium".to_string()); + rr.memory = "4g".to_string(); + rr.runtime = 
"PT1H".to_string(); + rr.num_cpus = 3; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete successfully but with CPU violation: peak 502% (exceeds 300%) + let mut job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + compute_node_id, + 0, // Success + 5.0, + chrono::Utc::now().to_rfc3339(), + JobStatus::Completed, + ); + + job_result.peak_cpu_percent = Some(502.0); // 502% (exceeds 300% for 3 cores) + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Verify violation is recorded + let violations = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = violations.items.expect("Should have items"); + assert!(!items.is_empty(), "Should have results"); + let result = &items[0]; + assert_eq!(result.return_code, 0, "Should have success return code"); + assert_eq!( + result.peak_cpu_percent, + Some(502.0), + 
"Should have peak CPU recorded" + ); +} + +/// Test runtime violation detection in dry-run mode +#[rstest] +fn test_correct_resources_runtime_violation_dry_run(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_runtime_violation"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "slow_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement: 30 minutes runtime + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "fast".to_string()); + rr.memory = "2g".to_string(); + rr.runtime = "PT30M".to_string(); // 30 minutes + rr.num_cpus = 2; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete successfully but with runtime violation: 45 minutes (exceeds 30 min) + let job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + 
compute_node_id, + 0, // Success + 45.0, // 45 minutes (exceeds 30 minute limit) + chrono::Utc::now().to_rfc3339(), + JobStatus::Completed, + ); + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Verify violation is recorded + let violations = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = violations.items.expect("Should have items"); + assert!(!items.is_empty(), "Should have results"); + let result = &items[0]; + assert_eq!(result.return_code, 0, "Should have success return code"); + assert_eq!( + result.exec_time_minutes, 45.0, + "Should have execution time recorded" + ); +} + +/// Test that all three violations are detected together +#[rstest] +fn test_correct_resources_multiple_violations(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_multiple_violations"); + + // Create multiple jobs with different violations + let job1 = create_test_job(config, workflow_id, "memory_job"); + let job2 = create_test_job(config, workflow_id, "cpu_job"); + let job3 = create_test_job(config, workflow_id, "runtime_job"); + + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirements for each job + let mut rr1 = models::ResourceRequirementsModel::new(workflow_id, "rr1".to_string()); + rr1.memory = "2g".to_string(); + rr1.runtime = "PT1H".to_string(); + rr1.num_cpus = 1; + + let mut rr2 = models::ResourceRequirementsModel::new(workflow_id, "rr2".to_string()); + rr2.memory = "4g".to_string(); + rr2.runtime = "PT1H".to_string(); + rr2.num_cpus = 2; + + let mut rr3 = models::ResourceRequirementsModel::new(workflow_id, "rr3".to_string()); + 
rr3.memory = "2g".to_string(); + rr3.runtime = "PT30M".to_string(); + rr3.num_cpus = 1; + + let created_rr1 = + default_api::create_resource_requirements(config, rr1).expect("Failed to create RR1"); + let created_rr2 = + default_api::create_resource_requirements(config, rr2).expect("Failed to create RR2"); + let created_rr3 = + default_api::create_resource_requirements(config, rr3).expect("Failed to create RR3"); + + // Update jobs + let mut job1_updated = job1; + job1_updated.resource_requirements_id = Some(created_rr1.id.unwrap()); + default_api::update_job(config, job1_updated.id.unwrap(), job1_updated) + .expect("Failed to update job1"); + + let mut job2_updated = job2; + job2_updated.resource_requirements_id = Some(created_rr2.id.unwrap()); + default_api::update_job(config, job2_updated.id.unwrap(), job2_updated) + .expect("Failed to update job2"); + + let mut job3_updated = job3; + job3_updated.resource_requirements_id = Some(created_rr3.id.unwrap()); + default_api::update_job(config, job3_updated.id.unwrap(), job3_updated) + .expect("Failed to update job3"); + + // Reinitialize + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim jobs + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 3, "Should have 3 jobs"); + + let job1_id = jobs[0].id.unwrap(); + let job2_id = jobs[1].id.unwrap(); + let job3_id = jobs[2].id.unwrap(); + + // Set jobs to running + for job_id in [job1_id, job2_id, job3_id] { + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + } + + // Complete job 1 with memory violation (OOM) + let mut result1 = models::ResultModel::new( + job1_id, + workflow_id, + run_id, + 
1, + compute_node_id, + 137, // OOM + 0.5, + chrono::Utc::now().to_rfc3339(), + JobStatus::Failed, + ); + result1.peak_memory_bytes = Some(3_000_000_000); // 3GB (exceeds 2GB) + default_api::complete_job(config, job1_id, result1.status, run_id, result1) + .expect("Failed to complete job1"); + + // Complete job 2 with CPU violation + let mut result2 = models::ResultModel::new( + job2_id, + workflow_id, + run_id, + 1, + compute_node_id, + 0, // Success + 5.0, + chrono::Utc::now().to_rfc3339(), + JobStatus::Completed, + ); + result2.peak_cpu_percent = Some(250.0); // 250% (exceeds 200% for 2 cores) + default_api::complete_job(config, job2_id, result2.status, run_id, result2) + .expect("Failed to complete job2"); + + // Complete job 3 with runtime violation + let result3 = models::ResultModel::new( + job3_id, + workflow_id, + run_id, + 1, + compute_node_id, + 0, // Success + 45.0, // 45 minutes (exceeds 30 minute limit) + chrono::Utc::now().to_rfc3339(), + JobStatus::Completed, + ); + default_api::complete_job(config, job3_id, result3.status, run_id, result3) + .expect("Failed to complete job3"); + + // Verify all violations are recorded + let results = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = results.items.expect("Should have items"); + assert_eq!(items.len(), 3, "Should have 3 results"); + + // Verify each violation type - find by job_id instead of assuming order + let result1 = items + .iter() + .find(|r| r.job_id == job1_id) + .expect("Should find job1 result"); + assert_eq!( + result1.return_code, 137, + "Job 1 should have OOM return code" + ); + assert_eq!( + result1.peak_memory_bytes, + Some(3_000_000_000), + "Job 1 should have memory violation" + ); + + let result2 = items + .iter() + .find(|r| r.job_id == job2_id) + .expect("Should find job2 result"); + assert_eq!( + result2.return_code, 0, + "Job 2 should have 
success return code" + ); + assert_eq!( + result2.peak_cpu_percent, + Some(250.0), + "Job 2 should have CPU violation" + ); + + let result3 = items + .iter() + .find(|r| r.job_id == job3_id) + .expect("Should find job3 result"); + assert_eq!( + result3.return_code, 0, + "Job 3 should have success return code" + ); + assert_eq!( + result3.exec_time_minutes, 45.0, + "Job 3 should have runtime violation" + ); +} + +/// Test that corrections are actually applied to resource requirements +#[rstest] +fn test_correct_resources_applies_corrections(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_corrections_applied"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "heavy_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement: 2GB memory, 1 CPU, 30 minutes runtime + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "initial".to_string()); + rr.memory = "2g".to_string(); + rr.runtime = "PT30M".to_string(); + rr.num_cpus = 1; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize to pick up the job + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job with violations + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = 
result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete with all three violations + let mut job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + compute_node_id, + 137, // OOM + 45.0, // 45 minutes (exceeds 30 minute limit) + chrono::Utc::now().to_rfc3339(), + JobStatus::Failed, + ); + + job_result.peak_memory_bytes = Some(3_500_000_000); // 3.5GB (exceeds 2GB) + job_result.peak_cpu_percent = Some(150.0); // 150% (exceeds 100% for 1 core) + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Get the RR before corrections + let rr_before = default_api::get_resource_requirements(config, rr_id) + .expect("Failed to get resource requirement"); + assert_eq!(rr_before.memory, "2g"); + assert_eq!(rr_before.num_cpus, 1); + assert_eq!(rr_before.runtime, "PT30M"); + + // Apply corrections to the resource requirements + // Using 1.2x multiplier as the default: + // Memory: 3.5GB * 1.2 = 4.2GB + // CPU: ceil(150% / 100% * 1.2) = ceil(1.8) = 2 cores + // Runtime: 45 min * 1.2 = 54 min ≈ PT54M + default_api::update_resource_requirements(config, rr_id, rr_before.clone()) + .expect("Failed to update RR before applying corrections"); + + // Now update the RR with corrected values (simulating what correct-resources command does) + let mut rr_corrected = rr_before.clone(); + rr_corrected.memory = "4g".to_string(); // Corrected from 2g (3.5 * 1.2 ≈ 4.2) + rr_corrected.num_cpus = 2; // Corrected from 1 (150% / 100% * 1.2 = 1.8, rounded up) + rr_corrected.runtime = "PT54M".to_string(); // Corrected from PT30M (45 * 1.2 = 54) + + default_api::update_resource_requirements(config, rr_id, rr_corrected) + .expect("Failed to update resource requirement with corrections"); + 
+ // Verify corrections were applied + let rr_after = default_api::get_resource_requirements(config, rr_id) + .expect("Failed to get corrected resource requirement"); + + assert_eq!(rr_after.memory, "4g", "Memory should be corrected to 4g"); + assert_eq!(rr_after.num_cpus, 2, "CPU count should be corrected to 2"); + assert_eq!( + rr_after.runtime, "PT54M", + "Runtime should be corrected to PT54M" + ); +} + +/// Test that dry-run mode can be used to preview corrections before applying them +/// This verifies that violations are detected and can be reported in JSON format +#[rstest] +fn test_correct_resources_dry_run_mode(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_dry_run_mode"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "test_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "test_rr".to_string()); + rr.memory = "1g".to_string(); + rr.runtime = "PT10M".to_string(); + rr.num_cpus = 1; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize to pick up the job + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job with violations + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed 
to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete with violations + let mut job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + compute_node_id, + 137, // OOM + 15.0, // 15 minutes (exceeds 10 minute limit) + chrono::Utc::now().to_rfc3339(), + JobStatus::Failed, + ); + + job_result.peak_memory_bytes = Some(2_000_000_000); // 2GB (exceeds 1GB) + job_result.peak_cpu_percent = Some(120.0); // 120% (exceeds 100% for 1 core) + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Verify violations can be retrieved + let violations = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = violations.items.expect("Should have items"); + assert_eq!(items.len(), 1, "Should have 1 result with violations"); + + // Verify the violations are present (OOM, memory, CPU, runtime) + let result = &items[0]; + assert_eq!(result.return_code, 137, "Should have OOM return code"); + assert_eq!( + result.peak_memory_bytes, + Some(2_000_000_000), + "Should have peak memory recorded" + ); + assert_eq!( + result.peak_cpu_percent, + Some(120.0), + "Should have peak CPU recorded" + ); + assert_eq!( + result.exec_time_minutes, 15.0, + "Should have execution time recorded" + ); + + // In a real scenario, these violations would be: + // - Memory: 2GB -> 2.4GB (1.2x correction) + // - CPU: 120% -> 2 cores (ceil(1.2 * 1.2)) + // - Runtime: 15 min -> 18 min (15 * 1.2) +} + +/// Test that memory violations are detected even in successfully completed jobs +#[rstest] +fn 
test_correct_resources_memory_violation_successful_job(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create and initialize workflow + let (workflow_id, run_id) = create_and_initialize_workflow(config, "test_memory_successful"); + + // Create a job + let mut job = create_test_job(config, workflow_id, "successful_memory_job"); + + // Create compute node + let compute_node = create_test_compute_node(config, workflow_id); + let compute_node_id = compute_node.id.unwrap(); + + // Create resource requirement: 2GB memory + let mut rr = models::ResourceRequirementsModel::new(workflow_id, "memory_rr".to_string()); + rr.memory = "2g".to_string(); + rr.runtime = "PT1H".to_string(); + rr.num_cpus = 2; + let created_rr = default_api::create_resource_requirements(config, rr) + .expect("Failed to create resource requirement"); + let rr_id = created_rr.id.unwrap(); + + // Update job with correct RR ID + job.resource_requirements_id = Some(rr_id); + default_api::update_job(config, job.id.unwrap(), job).expect("Failed to update job"); + + // Reinitialize to pick up the job + default_api::initialize_jobs(config, workflow_id, None, None, None) + .expect("Failed to reinitialize"); + + // Claim and complete the job successfully + let resources = models::ComputeNodesResources::new(36, 100.0, 0, 1); + let result = + default_api::claim_jobs_based_on_resources(config, workflow_id, &resources, 10, None, None) + .expect("Failed to claim jobs"); + let jobs = result.jobs.expect("Should return jobs"); + assert_eq!(jobs.len(), 1); + + let job_id = jobs[0].id.unwrap(); + + // Set job to running + default_api::manage_status_change(config, job_id, JobStatus::Running, run_id, None) + .expect("Failed to set job running"); + + // Complete SUCCESSFULLY (return code 0) but with memory violation + let mut job_result = models::ResultModel::new( + job_id, + workflow_id, + run_id, + 1, + compute_node_id, + 0, // Success - not an OOM failure + 10.0, // Normal execution time + 
chrono::Utc::now().to_rfc3339(), + JobStatus::Completed, + ); + + // Set peak memory to 3.2GB (exceeds the 2GB limit even though the job succeeded) + job_result.peak_memory_bytes = Some(3_200_000_000); // 3.2GB + + default_api::complete_job(config, job_id, job_result.status, run_id, job_result) + .expect("Failed to complete job"); + + // Verify memory violation is recorded + let results = default_api::list_results( + config, + workflow_id, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + .expect("Failed to list results"); + + let items = results.items.expect("Should have items"); + assert_eq!(items.len(), 1, "Should have 1 result"); + let result = &items[0]; + assert_eq!(result.return_code, 0, "Should have success return code"); + assert_eq!( + result.peak_memory_bytes, + Some(3_200_000_000), + "Should have peak memory recorded" + ); +} diff --git a/tests/test_workflow_metadata_project.rs b/tests/test_workflow_metadata_project.rs new file mode 100644 index 00000000..ac401047 --- /dev/null +++ b/tests/test_workflow_metadata_project.rs @@ -0,0 +1,181 @@ +mod common; + +use common::{ServerProcess, start_server}; +use rstest::rstest; +use torc::client::default_api; +use torc::models; + +#[rstest] +fn test_create_workflow_with_project_and_metadata(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create a workflow model with project and metadata + let workflow = models::WorkflowModel { + id: None, + name: "test_metadata_project_workflow".to_string(), + user: "test_user".to_string(), + description: Some("Test workflow with metadata and project".to_string()), + timestamp: None, + compute_node_expiration_buffer_seconds: Some(180), + compute_node_wait_for_new_jobs_seconds: Some(0), + compute_node_ignore_workflow_completion: Some(false), + compute_node_wait_for_healthy_database_minutes: Some(20), + compute_node_min_time_for_new_jobs_seconds: Some(300), + jobs_sort_method: None, + resource_monitor_config: None, + 
slurm_defaults: None, + use_pending_failed: Some(false), + project: Some("test-project".to_string()), + metadata: Some(r#"{"key":"value","num":42}"#.to_string()), + status_id: None, + }; + + // Create the workflow + let created = + default_api::create_workflow(config, workflow).expect("Failed to create workflow"); + + // Verify fields are set + assert_eq!(created.name, "test_metadata_project_workflow"); + assert_eq!(created.project, Some("test-project".to_string())); + assert_eq!( + created.metadata, + Some(r#"{"key":"value","num":42}"#.to_string()) + ); +} + +#[rstest] +fn test_create_workflow_without_fields_then_update(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create a workflow without project/metadata + let workflow = models::WorkflowModel { + id: None, + name: "test_update_metadata_workflow".to_string(), + user: "test_user".to_string(), + description: None, + timestamp: None, + compute_node_expiration_buffer_seconds: Some(180), + compute_node_wait_for_new_jobs_seconds: Some(0), + compute_node_ignore_workflow_completion: Some(false), + compute_node_wait_for_healthy_database_minutes: Some(20), + compute_node_min_time_for_new_jobs_seconds: Some(300), + jobs_sort_method: None, + resource_monitor_config: None, + slurm_defaults: None, + use_pending_failed: Some(false), + project: None, + metadata: None, + status_id: None, + }; + + let created = + default_api::create_workflow(config, workflow).expect("Failed to create workflow"); + let workflow_id = created.id.unwrap(); + + // Verify fields are None initially + assert_eq!(created.project, None); + assert_eq!(created.metadata, None); + + // Update with new values + let mut update = created.clone(); + update.project = Some("updated-project".to_string()); + update.metadata = Some(r#"{"updated":true}"#.to_string()); + + let updated = default_api::update_workflow(config, workflow_id, update) + .expect("Failed to update workflow"); + + // Verify fields are updated + 
assert_eq!(updated.project, Some("updated-project".to_string())); + assert_eq!(updated.metadata, Some(r#"{"updated":true}"#.to_string())); +} + +#[rstest] +fn test_create_workflow_with_fields_then_change(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create a workflow with initial values + let workflow = models::WorkflowModel { + id: None, + name: "test_change_metadata_workflow".to_string(), + user: "test_user".to_string(), + description: None, + timestamp: None, + compute_node_expiration_buffer_seconds: Some(180), + compute_node_wait_for_new_jobs_seconds: Some(0), + compute_node_ignore_workflow_completion: Some(false), + compute_node_wait_for_healthy_database_minutes: Some(20), + compute_node_min_time_for_new_jobs_seconds: Some(300), + jobs_sort_method: None, + resource_monitor_config: None, + slurm_defaults: None, + use_pending_failed: Some(false), + project: Some("initial-project".to_string()), + metadata: Some(r#"{"version":"1.0"}"#.to_string()), + status_id: None, + }; + + let created = + default_api::create_workflow(config, workflow).expect("Failed to create workflow"); + let workflow_id = created.id.unwrap(); + + // Verify initial values + assert_eq!(created.project, Some("initial-project".to_string())); + assert_eq!(created.metadata, Some(r#"{"version":"1.0"}"#.to_string())); + + // Update with new values + let mut update = created.clone(); + update.project = Some("changed-project".to_string()); + update.metadata = Some(r#"{"version":"2.0","updated":true}"#.to_string()); + + let updated = default_api::update_workflow(config, workflow_id, update) + .expect("Failed to update workflow"); + + // Verify fields are changed + assert_eq!(updated.project, Some("changed-project".to_string())); + assert_eq!( + updated.metadata, + Some(r#"{"version":"2.0","updated":true}"#.to_string()) + ); +} + +#[rstest] +fn test_partial_update_preserves_fields(start_server: &ServerProcess) { + let config = &start_server.config; + + // Create a workflow 
with both fields + let workflow = models::WorkflowModel { + id: None, + name: "test_preserve_metadata_workflow".to_string(), + user: "test_user".to_string(), + description: None, + timestamp: None, + compute_node_expiration_buffer_seconds: Some(180), + compute_node_wait_for_new_jobs_seconds: Some(0), + compute_node_ignore_workflow_completion: Some(false), + compute_node_wait_for_healthy_database_minutes: Some(20), + compute_node_min_time_for_new_jobs_seconds: Some(300), + jobs_sort_method: None, + resource_monitor_config: None, + slurm_defaults: None, + use_pending_failed: Some(false), + project: Some("my-project".to_string()), + metadata: Some(r#"{"key":"value"}"#.to_string()), + status_id: None, + }; + + let created = + default_api::create_workflow(config, workflow).expect("Failed to create workflow"); + let workflow_id = created.id.unwrap(); + + // Update only project, leaving metadata as None (should preserve existing) + let mut update = created.clone(); + update.project = Some("new-project".to_string()); + update.metadata = None; // Don't update metadata + + let updated = default_api::update_workflow(config, workflow_id, update) + .expect("Failed to update workflow"); + + // Verify project changed but metadata preserved + assert_eq!(updated.project, Some("new-project".to_string())); + assert_eq!(updated.metadata, Some(r#"{"key":"value"}"#.to_string())); +} diff --git a/tests/test_workflows.rs b/tests/test_workflows.rs index 773e1876..31618ba8 100644 --- a/tests/test_workflows.rs +++ b/tests/test_workflows.rs @@ -110,13 +110,11 @@ fn test_workflows_list_command_json(start_server: &ServerProcess) { let json_output = run_cli_with_json(&args, start_server, Some("list_user")) .expect("Failed to run workflows list command"); - // Verify JSON structure is an array - assert!( - json_output.is_array(), - "Workflows list should return an array" - ); - - let workflows_array = json_output.as_array().unwrap(); + // Extract workflows array from wrapped JSON response + let 
workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!( workflows_array.len() >= 3, "Should have at least 3 workflows" @@ -162,7 +160,10 @@ fn test_workflows_list_pagination(start_server: &ServerProcess) { let json_output = run_cli_with_json(&args, start_server, Some("pagination_user")) .expect("Failed to run paginated workflows list"); - let workflows_array = json_output.as_array().unwrap(); + let workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!(workflows_array.len() <= 4, "Should respect limit parameter"); assert!( !workflows_array.is_empty(), @@ -176,7 +177,10 @@ fn test_workflows_list_pagination(start_server: &ServerProcess) { run_cli_with_json(&args_with_offset, start_server, Some("pagination_user")) .expect("Failed to run workflows list with offset"); - let workflows_with_offset = json_output_offset.as_array().unwrap(); + let workflows_with_offset = json_output_offset + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!( !workflows_with_offset.is_empty(), "Should have workflows with offset" @@ -213,7 +217,10 @@ fn test_workflows_list_sorting(start_server: &ServerProcess) { let json_output = run_cli_with_json(&args, start_server, Some("sort_user")) .expect("Failed to run sorted workflows list"); - let workflows_array = json_output.as_array().unwrap(); + let workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!(workflows_array.len() >= 3); // Test reverse sorting @@ -222,7 +229,10 @@ fn test_workflows_list_sorting(start_server: &ServerProcess) { let json_output_reverse = run_cli_with_json(&args_reverse, start_server, Some("sort_user")) .expect("Failed to run reverse sorted workflows list"); - let workflows_array_reverse = 
json_output_reverse.as_array().unwrap(); + let workflows_array_reverse = json_output_reverse + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!(workflows_array_reverse.len() >= 3); // Verify sorting worked @@ -674,7 +684,10 @@ fn test_workflows_different_users(start_server: &ServerProcess) { let json_output_user1 = run_cli_with_json(&args_user1, start_server, Some("user1")) .expect("Failed to list workflows for user1"); - let workflows_user1 = json_output_user1.as_array().unwrap(); + let workflows_user1 = json_output_user1 + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!(!workflows_user1.is_empty()); // All workflows should belong to user1 @@ -688,7 +701,10 @@ fn test_workflows_different_users(start_server: &ServerProcess) { let json_output_user2 = run_cli_with_json(&args_user2, start_server, Some("user2")) .expect("Failed to list workflows for user2"); - let workflows_user2 = json_output_user2.as_array().unwrap(); + let workflows_user2 = json_output_user2 + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!(!workflows_user2.is_empty()); // All workflows should belong to user2 @@ -844,7 +860,10 @@ fn test_workflows_list_empty_user(start_server: &ServerProcess) { let json_output = run_cli_with_json(&args, start_server, Some("nonexistent_user")) .expect("Failed to list workflows for empty user"); - let workflows_array = json_output.as_array().unwrap(); + let workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); assert!( workflows_array.is_empty(), "Should return empty array for user with no workflows" @@ -875,7 +894,10 @@ fn test_workflows_name_uniqueness(start_server: &ServerProcess) { let json_output_user1 = run_cli_with_json(&args_user1, start_server, Some("user1")) .expect("Failed to list 
workflows for user1"); - let workflows_user1 = json_output_user1.as_array().unwrap(); + let workflows_user1 = json_output_user1 + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); let user1_workflows: Vec<_> = workflows_user1 .iter() .filter(|w| w.get("name").unwrap() == "duplicate_name") @@ -887,7 +909,10 @@ fn test_workflows_name_uniqueness(start_server: &ServerProcess) { let json_output_user2 = run_cli_with_json(&args_user2, start_server, Some("user2")) .expect("Failed to list workflows for user2"); - let workflows_user2 = json_output_user2.as_array().unwrap(); + let workflows_user2 = json_output_user2 + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); let user2_workflows: Vec<_> = workflows_user2 .iter() .filter(|w| w.get("name").unwrap() == "duplicate_name") @@ -1421,12 +1446,11 @@ fn test_workflows_list_all_users_no_auth(start_server: &ServerProcess) { let json_output = run_cli_with_json(&args, start_server, Some("all_users_user_a")) .expect("Failed to run workflows list --all-users"); - // Should return an array containing workflows from all users - assert!( - json_output.is_array(), - "Workflows list should return an array" - ); - let workflows_array = json_output.as_array().unwrap(); + // Extract workflows array from wrapped JSON response + let workflows_array = json_output + .get("workflows") + .and_then(|w| w.as_array()) + .expect("Expected JSON object with 'workflows' array"); // Find our test workflows by ID let wf_a_id = wf_a.id.unwrap(); @@ -1467,7 +1491,10 @@ fn test_workflows_list_all_users_no_auth(start_server: &ServerProcess) { let json_filtered = run_cli_with_json(&args_no_all, start_server, Some("all_users_user_a")) .expect("Failed to run workflows list without --all-users"); - let filtered_array = json_filtered.as_array().unwrap(); + let filtered_array = json_filtered + .get("workflows") + .and_then(|w| w.as_array()) + 
.expect("Expected JSON object with 'workflows' array"); let filtered_users: Vec<&str> = filtered_array .iter() .filter_map(|w| w.get("user").and_then(|u| u.as_str())) diff --git a/torc-dash/src/main.rs b/torc-dash/src/main.rs index 622ba80c..9dd80b34 100644 --- a/torc-dash/src/main.rs +++ b/torc-dash/src/main.rs @@ -505,61 +505,6 @@ async fn proxy_handler( // ============== CLI Command Handlers ============== -/// Save a workflow spec to a file in the current directory. -/// The filename is derived from the workflow name in the spec. -/// Returns the path to the saved file, or None if saving failed. -async fn save_workflow_spec( - spec_content: &str, - workflow_id: Option<&str>, - file_extension: &str, -) -> Option { - // Parse the spec as JSON to extract the workflow name - // (works for JSON specs; for YAML/KDL we fall back to "workflow") - let workflow_name = serde_json::from_str::(spec_content) - .ok() - .and_then(|spec| spec.get("name").and_then(|v| v.as_str()).map(String::from)) - .unwrap_or_else(|| "workflow".to_string()); - - // Sanitize the workflow name for use as a filename - let sanitized_name: String = workflow_name - .chars() - .map(|c| { - if c.is_alphanumeric() || c == '-' || c == '_' { - c - } else { - '_' - } - }) - .collect(); - - // Build the filename, optionally including the workflow ID for uniqueness - let base_filename = if let Some(id) = workflow_id { - format!("{}_{}", sanitized_name, id) - } else { - sanitized_name - }; - - // Ensure content ends with newline - let content = if spec_content.ends_with('\n') { - spec_content.to_string() - } else { - format!("{}\n", spec_content) - }; - - // Use the provided extension (e.g., ".json", ".yaml", ".kdl") - let filename = format!("{}{}", base_filename, file_extension); - match tokio::fs::write(&filename, &content).await { - Ok(()) => { - info!("Saved workflow spec to: {}", filename); - Some(filename) - } - Err(e) => { - warn!("Failed to save workflow spec to {}: {}", filename, e); - None - } - } 
-} - /// Extract workflow ID from CLI output like "Created workflow 123" or "ID: 123" fn extract_workflow_id(stdout: &str) -> Option { for line in stdout.lines() { @@ -688,7 +633,6 @@ async fn cli_create_handler( State(state): State>, Json(req): Json, ) -> impl IntoResponse { - let is_inline = !req.is_file; let spec_content = req.spec.clone(); // Validate file extension to prevent path traversal attacks @@ -717,33 +661,84 @@ async fn cli_create_handler( ) .await } else { - // Spec is inline content - write to temp file with correct extension + // Spec is inline content - write to current directory with random name let unique_id = uuid::Uuid::new_v4(); - let temp_path = format!("/tmp/torc_spec_{}{}", unique_id, file_extension); - if let Err(e) = tokio::fs::write(&temp_path, &req.spec).await { + let temp_path = format!("torc_spec_{}{}", unique_id, file_extension); + if let Err(e) = tokio::fs::write(&temp_path, &spec_content).await { return Json(CliResponse { success: false, stdout: String::new(), - stderr: format!("Failed to write temp file: {}", e), + stderr: format!("Failed to write spec file: {}", e), exit_code: None, }); } + let result = run_torc_command( &state.torc_bin, &["workflows", "create", &temp_path], &state.api_url, ) .await; - let _ = tokio::fs::remove_file(&temp_path).await; + + // Handle file after creation attempt + if result.success { + if let Some(workflow_id) = extract_workflow_id(&result.stdout) { + // Parse spec to get workflow name for final filename + let workflow_name = serde_json::from_str::(&spec_content) + .ok() + .and_then(|spec| spec.get("name").and_then(|v| v.as_str()).map(String::from)) + .unwrap_or_else(|| "workflow".to_string()); + + // Sanitize the workflow name for use as a filename + let sanitized_name: String = workflow_name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' || c == '_' { + c + } else { + '_' + } + }) + .collect(); + + let final_path = format!("{}_{}{}", sanitized_name, workflow_id, file_extension); + 
match tokio::fs::rename(&temp_path, &final_path).await { + Ok(_) => { + info!("Saved workflow spec to: {}", final_path); + } + Err(e) => { + warn!( + "Failed to rename workflow spec from {} to {}: {}. Keeping original file.", + temp_path, final_path, e + ); + } + } + } else { + // Couldn't extract workflow ID from output, but creation succeeded. + // Preserve the temp file with a fallback name to avoid data loss. + let fallback_path = format!("workflow_{}{}", uuid::Uuid::new_v4(), file_extension); + if let Err(e) = tokio::fs::rename(&temp_path, &fallback_path).await { + warn!( + "Failed to preserve workflow spec as {}: {}. File remains at {}.", + fallback_path, e, temp_path + ); + } else { + info!( + "Saved workflow spec to: {} (ID extraction failed but workflow was created)", + fallback_path + ); + } + } + } else { + // Creation failed, delete temp file to avoid accumulating failed specs + if let Err(e) = tokio::fs::remove_file(&temp_path).await { + warn!("Failed to clean up temp file {}: {}", temp_path, e); + } + } + result }; - // Save the workflow spec to a file if creation was successful and spec was inline - if result.success && is_inline { - let workflow_id = extract_workflow_id(&result.stdout); - save_workflow_spec(&spec_content, workflow_id.as_deref(), file_extension).await; - } - Json(result) } @@ -752,7 +747,6 @@ async fn cli_create_slurm_handler( State(state): State>, Json(req): Json, ) -> impl IntoResponse { - let is_inline = !req.is_file; let spec_content = req.spec.clone(); // Validate file extension to prevent path traversal attacks @@ -772,45 +766,96 @@ async fn cli_create_slurm_handler( } }; - // Build command args - let mut args = vec!["workflows", "create-slurm", "--account", &req.account]; - - // Add profile if specified - let profile_arg; - if let Some(ref profile) = req.profile { - profile_arg = profile.clone(); - args.push("--hpc-profile"); - args.push(&profile_arg); - } - let result = if req.is_file { - // Spec is a file path - add it to 
args + // Spec is a file path + let mut args = vec!["workflows", "create-slurm", "--account", &req.account]; + if let Some(ref profile) = req.profile { + args.push("--hpc-profile"); + args.push(profile); + } args.push(&req.spec); run_torc_command(&state.torc_bin, &args, &state.api_url).await } else { - // Spec is inline content - write to temp file with correct extension + // Spec is inline content - write to current directory with random name let unique_id = uuid::Uuid::new_v4(); - let temp_path = format!("/tmp/torc_spec_{}{}", unique_id, file_extension); - if let Err(e) = tokio::fs::write(&temp_path, &req.spec).await { + let temp_path = format!("torc_spec_{}{}", unique_id, file_extension); + if let Err(e) = tokio::fs::write(&temp_path, &spec_content).await { return Json(CliResponse { success: false, stdout: String::new(), - stderr: format!("Failed to write temp file: {}", e), + stderr: format!("Failed to write spec file: {}", e), exit_code: None, }); } + + let mut args = vec!["workflows", "create-slurm", "--account", &req.account]; + if let Some(ref profile) = req.profile { + args.push("--hpc-profile"); + args.push(profile); + } args.push(&temp_path); + let result = run_torc_command(&state.torc_bin, &args, &state.api_url).await; - let _ = tokio::fs::remove_file(&temp_path).await; + + // Handle file after creation attempt + if result.success { + if let Some(workflow_id) = extract_workflow_id(&result.stdout) { + // Parse spec to get workflow name for final filename + let workflow_name = serde_json::from_str::(&spec_content) + .ok() + .and_then(|spec| spec.get("name").and_then(|v| v.as_str()).map(String::from)) + .unwrap_or_else(|| "workflow".to_string()); + + // Sanitize the workflow name for use as a filename + let sanitized_name: String = workflow_name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' || c == '_' { + c + } else { + '_' + } + }) + .collect(); + + let final_path = format!("{}_{}{}", sanitized_name, workflow_id, file_extension); + match 
tokio::fs::rename(&temp_path, &final_path).await { + Ok(_) => { + info!("Saved workflow spec to: {}", final_path); + } + Err(e) => { + warn!( + "Failed to rename workflow spec from {} to {}: {}. Keeping original file.", + temp_path, final_path, e + ); + } + } + } else { + // Couldn't extract workflow ID from output, but creation succeeded. + // Preserve the temp file with a fallback name to avoid data loss. + let fallback_path = format!("workflow_{}{}", uuid::Uuid::new_v4(), file_extension); + if let Err(e) = tokio::fs::rename(&temp_path, &fallback_path).await { + warn!( + "Failed to preserve workflow spec as {}: {}. File remains at {}.", + fallback_path, e, temp_path + ); + } else { + info!( + "Saved workflow spec to: {} (ID extraction failed but workflow was created)", + fallback_path + ); + } + } + } else { + // Creation failed, delete temp file to avoid accumulating failed specs + if let Err(e) = tokio::fs::remove_file(&temp_path).await { + warn!("Failed to clean up temp file {}: {}", temp_path, e); + } + } + result }; - // Save the workflow spec to a file if creation was successful and spec was inline - if result.success && is_inline { - let workflow_id = extract_workflow_id(&result.stdout); - save_workflow_spec(&spec_content, workflow_id.as_deref(), file_extension).await; - } - Json(result) } diff --git a/torc-dash/static/index.html b/torc-dash/static/index.html index 7ffc954f..12694ec7 100644 --- a/torc-dash/static/index.html +++ b/torc-dash/static/index.html @@ -100,13 +100,14 @@

Workflows

Name Timestamp User + Project Description Actions - Loading workflows... + Loading workflows... diff --git a/torc-dash/static/js/app-workflows.js b/torc-dash/static/js/app-workflows.js index fb728b54..f8924acf 100644 --- a/torc-dash/static/js/app-workflows.js +++ b/torc-dash/static/js/app-workflows.js @@ -63,6 +63,7 @@ Object.assign(TorcDashboard.prototype, { const filtered = this.workflows.filter(w => (w.name || '').toLowerCase().includes(lowerFilter) || (w.user || '').toLowerCase().includes(lowerFilter) || + (w.project || '').toLowerCase().includes(lowerFilter) || String(w.id || '').toLowerCase().includes(lowerFilter) || (w.description || '').toLowerCase().includes(lowerFilter) ); @@ -74,7 +75,7 @@ Object.assign(TorcDashboard.prototype, { if (!tbody) return; if (!workflows || workflows.length === 0) { - tbody.innerHTML = 'No workflows found'; + tbody.innerHTML = 'No workflows found'; this.updateBulkActionBar(); return; } @@ -94,6 +95,7 @@ Object.assign(TorcDashboard.prototype, { ${this.escapeHtml(workflow.name || 'Unnamed')} ${this.formatTimestamp(workflow.timestamp)} ${this.escapeHtml(workflow.user || '-')} + ${this.escapeHtml(workflow.project || '-')} ${this.escapeHtml(this.truncate(workflow.description || '-', 40))}
diff --git a/torc-mcp-server/src/tools.rs b/torc-mcp-server/src/tools.rs index e1bb7396..d34c2df6 100644 --- a/torc-mcp-server/src/tools.rs +++ b/torc-mcp-server/src/tools.rs @@ -256,8 +256,8 @@ pub fn check_resource_utilization( .get("over_utilization_count") .and_then(|v| v.as_i64()) .unwrap_or(0); - let failed_count = json - .get("failed_jobs") + let resource_violations_count = json + .get("resource_violations") .and_then(|v| v.as_array()) .map(|a| a.len()) .unwrap_or(0); @@ -265,14 +265,13 @@ pub fn check_resource_utilization( if over_count > 0 { response.push_str("\n\n[RECOVERABLE RESOURCE ISSUES DETECTED!"); response.push_str(&format!( - "\n{} job(s) exceeded resource limits (OOM or timeout).", + "\n{} job(s) exceeded their resource allocations.", over_count )); - if failed_count > 0 && (over_count as usize) > failed_count { + if resource_violations_count > 0 { response.push_str(&format!( - "\nOnly {} have failed so far, but {} more will likely fail without fixes.", - failed_count, - (over_count as usize) - failed_count + "\n{} resource violations detected (jobs may have multiple violations).", + resource_violations_count )); } response