diff --git a/.bazelci/run_server_test.sh b/.bazelci/run_server_test.sh index 21b6d70389..54e4658b16 100755 --- a/.bazelci/run_server_test.sh +++ b/.bazelci/run_server_test.sh @@ -8,11 +8,11 @@ bazel build //src/main/java/build/buildfarm:buildfarm-shard-worker bazel build //src/main/java/build/buildfarm:buildfarm-server # Start a single worker -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > server.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & echo "Started buildfarm-shard-worker..." # Start a single server -bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > server.log 2>&1 & echo "Started buildfarm-server..." echo "Wait for startup to finish..." diff --git a/.bazelrc b/.bazelrc index cfea58ba6a..c3caf4d8fb 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,3 +1,9 @@ +build --java_language_version=17 +build --java_runtime_version=remotejdk_17 + +build --tool_java_language_version=17 +build --tool_java_runtime_version=remotejdk_17 + common --enable_platform_specific_config build:fuse --define=fuse=true @@ -14,3 +20,7 @@ test --test_tag_filters=-redis,-integration # Ensure buildfarm is compatible with future versions of bazel. # https://buildkite.com/bazel/bazelisk-plus-incompatible-flags common --incompatible_disallow_empty_glob + +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 +common --noenable_bzlmod diff --git a/.bazelversion b/.bazelversion index 5e3254243a..19b860c187 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -6.1.2 +6.4.0 diff --git a/BUILD b/BUILD index fc380f9846..f1cf17c796 100644 --- a/BUILD +++ b/BUILD @@ -121,7 +121,7 @@ sh_binary( java_image( name = "buildfarm-server", args = ["/app/build_buildfarm/examples/config.minimal.yml"], - base = "@amazon_corretto_java_image_base//image", + base = "@ubuntu-mantic//image", classpath_resources = [ "//src/main/java/build/buildfarm:configs", ], @@ -149,14 +149,14 @@ oss_audit( # Download cgroup-tools so that the worker is able to restrict actions via control groups. download_pkgs( name = "worker_pkgs", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", packages = ["cgroup-tools"], tags = ["container"], ) install_pkgs( name = "worker_pkgs_image", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", installables_tar = ":worker_pkgs.tar", installation_cleanup_commands = "rm -rf /var/lib/apt/lists/*", output_image_name = "worker_pkgs_image", diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..cabbbe697c --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,2 @@ +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 diff --git a/README.md b/README.md index 8dc99874b3..cd9caf8ee3 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ All commandline options override corresponding config settings. Run via -``` -docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9 +```shell +$ docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9 redis-cli config set stop-writes-on-bgsave-error no ``` @@ -28,10 +28,10 @@ redis-cli config set stop-writes-on-bgsave-error no Run via -``` -bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- +```shell +$ bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- -Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file: **`configfile`** has to be in [yaml format](https://bazelbuild.github.io/bazel-buildfarm/docs/configuration). @@ -40,10 +40,10 @@ Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag= Run via -``` -bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- +```shell +$ bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- -Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file: @@ -53,9 +53,9 @@ Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm To use the example configured buildfarm with bazel (version 1.0 or higher), you can configure your `.bazelrc` as follows: -``` +```shell $ cat .bazelrc -build --remote_executor=grpc://localhost:8980 +$ build --remote_executor=grpc://localhost:8980 ``` Then run your build as you would normally do. @@ -67,20 +67,20 @@ Buildfarm uses [Java's Logging framework](https://docs.oracle.com/javase/10/core You can use typical Java logging configuration to filter these results and observe the flow of executions through your running services. An example `logging.properties` file has been provided at [examples/logging.properties](examples/logging.properties) for use as follows: -``` -bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +```shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` and -``` -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +``` shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` To attach a remote debugger, run the executable with the `--debug=` flag. For example: -``` -bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml +```shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml ``` diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index cbd286587f..466375373f 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -7,7 +7,7 @@ has_children: true Minimal required: -``` +```yaml backplane: redisUri: "redis://localhost:6379" queues: @@ -21,24 +21,25 @@ worker: publicName: "localhost:8981" ``` -The configuration can be provided to the server and worker as a CLI argument or through the env variable `CONFIG_PATH` -For an example config containing all of the configuration values, see `examples/config.yml`. +The configuration can be provided to the server and worker as a CLI argument or through the environment variable `CONFIG_PATH` +For an example configuration containing all of the configuration values, see `examples/config.yml`. ## All Configurations ### Common -| Configuration | Accepted and _Default_ Values | Description | -|----------------------|-------------------------------|---------------------------------------------------| -| digestFunction | _SHA256_, SHA1 | Digest function for this implementation | -| defaultActionTimeout | Integer, _600_ | Default timeout value for an action (seconds) | -| maximumActionTimeout | Integer, _3600_ | Maximum allowed action timeout (seconds) | -| maxEntrySizeBytes | Long, _2147483648_ | Maximum size of a single blob accepted (bytes) | -| prometheusPort | Integer, _9090_ | Listening port of the Prometheus metrics endpoint | +| Configuration | Accepted and _Default_ Values | Command Line Argument | Description | +|------------------------------|-------------------------------|-----------------------|--------------------------------------------------------------| +| digestFunction | _SHA256_, SHA1 | | Digest function for this implementation | +| defaultActionTimeout | Integer, _600_ | | Default timeout value for an action (seconds) | +| maximumActionTimeout | Integer, _3600_ | | Maximum allowed action timeout (seconds) | +| maxEntrySizeBytes | Long, _2147483648_ | | Maximum size of a single blob accepted (bytes) | +| prometheusPort | Integer, _9090_ | --prometheus_port | Listening port of the Prometheus metrics endpoint | +| allowSymlinkTargetAbsolute | boolean, _false_ | | Permit inputs to contain symlinks with absolute path targets | Example: -``` +```yaml digestFunction: SHA1 defaultActionTimeout: 1800 maximumActionTimeout: 1800 @@ -51,33 +52,35 @@ worker: ### Server -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|----------------------------------|-------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) | -| name | String, _shard_ | | Implementation name | -| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | -| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache | -| port | Integer, _8980_ | | Listening port of the GRPC server | -| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) | -| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) | -| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) | -| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | -| runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued | -| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) | -| runOperationQueuer | boolean, _true_ | | Aquire execute request entries cooperatively from an arrival queue on the backplane | -| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. If any outputs are missing a cache miss is returned | -| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) | -| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation | -| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) | -| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) | -| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) | -| recordBesEvents | boolean, _false_ | | Allow recording of BES events | -| clusterId | String, _local_ | | Buildfarm cluster ID | -| cloudRegion | String, _us-east_1_ | | Deployment region in the cloud | +| Configuration | Accepted and _Default_ Values | Environment Var | Description | +|----------------------------------|-------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------| +| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) | +| name | String, _shard_ | | Implementation name | +| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | +| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache | +| port | Integer, _8980_ | | Listening port of the GRPC server | +| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) | +| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) | +| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) | +| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | +| runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued | +| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) | +| runOperationQueuer | boolean, _true_ | | Acquire execute request entries cooperatively from an arrival queue on the backplane | +| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. If any outputs are missing a cache miss is returned | +| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) | +| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation | +| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) | +| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) | +| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) | +| recordBesEvents | boolean, _false_ | | Allow recording of BES events | +| clusterId | String, _local_ | | Buildfarm cluster ID | +| cloudRegion | String, _us-east_1_ | | Deployment region in the cloud | +| gracefulShutdownSeconds | Integer, 0 | | Time in seconds to allow for connections in flight to finish when shutdown signal is received | + Example: -``` +```yaml server: instanceType: SHARD name: shard @@ -94,7 +97,7 @@ server: Example: -``` +```yaml server: grpcMetrics: enabled: false @@ -103,16 +106,16 @@ server: ### Server Caches -| Configuration | Accepted and _Default_ Values | Description | -|--------------------------|-------------------------------|--------------------------------------------------------| -| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. | -| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. | -| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. | -| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. | +| Configuration | Accepted and _Default_ Values | Description | +|---------------------------------------|-------------------------------|----------------------------------------------------------------------| +| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. | +| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. | +| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. | +| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. | Example: -``` +```yaml server: caches: directoryCacheMaxEntries: 10000 @@ -130,7 +133,7 @@ server: Example: -``` +```yaml server: admin: deploymentEnvironment: AWS @@ -149,14 +152,14 @@ server: Example: -``` +```yaml server: metrics: publisher: log logLevel: INFO ``` -``` +```yaml server: metrics: publisher: aws @@ -167,45 +170,45 @@ server: ### Redis Backplane -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|------------------------------|------------------------------------------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| type | _SHARD_ | | Type of backplane. Currently, the only implemntation is SHARD utilizing Redis | -| redisUri | String, redis://localhost:6379 | REDIS_URI | Redis cluster endpoint. This must be a single URI | -| redisPassword | String, _null_ | | Redis password, if applicable | -| redisNodes | List of Strings, _null_ | | List of individual Redis nodes, if applicable | -| jedisPoolMaxTotal | Integer, _4000_ | | The size of the Redis connection pool | -| workersHashName | String, _Workers_ | | Redis key used to store a hash of registered workers | -| workerChannel | String, _WorkerChannel_ | | Redis pubsub channel key where changes of the cluster membership are announced | -| actionCachePrefix | String, _ActionCache_ | | Redis key prefix for all ActionCache entries | -| actionCacheExpire | Integer, _2419200_ | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit | -| actionBlacklistPrefix | String, _ActionBlacklist_ | | Redis key prefix for all blacklisted actions, which are rejected | -| actionBlacklistExpire | Integer, _3600_ | | The TTL maintained for action blacklist entries | -| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | Redis key prefix for blacklisted invocations, suffixed with a a tool invocation ID | -| operationPrefix | String, _Operation_ | | Redis key prefix for all operations, suffixed wtih the operation's name | -| operationExpire | Integer, _604800_ | | The TTL maintained for all operations, updated on each modification | -| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry | -| processingListName | String, _{Arrival}:ProcessingOperations_ | | Redis key of a list used to ensure reliable processing of arrival queue etries with operation watch monitoring | -| processingPrefix | String, _Processing_ | | Redis key prefix for operations which are being dequeued from the arrival queue | -| processingTimeoutMillis | Integer, _20000_ | | Delay (in ms) used to populate processing operation entries | -| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | Redis key used to store a list of QueueEntry awaiting execution by workers | -| dispatchingPrefix | String, _Dispatching_ | | Redis key prefix for operations which are being dequeued from the ready to run queue | -| dispatchingTimeoutMillis | Integer, _10000_ | | Delay (in ms) used to populate dispatching operation entries | -| dispatchedOperationsHashName | String, _DispatchedOperations_ | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor | -| operationChannelPrefix | String, _OperationChannel_ | | Redis pubsub channel prefix suffixed by an operation name | -| casPrefix | String, _ContentAddressableStorage_ | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability | -| casExpire | Integer, _604800_ | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob | -| subscribeToBackplane | boolean, _true_ | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. If disabled, responsiveness of watchers and CAS are reduced | -| runFailsafeOperation | boolean, _true_ | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state | -| maxQueueDepth | Integer, _100000_ | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution | -| maxPreQueueDepth | Integer, _1000000_ | | Maximum lengh that the arrival queue is allowed to reach to control load on the Redis cluster | -| priorityQueue | boolean, _false_ | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag | -| timeout | Integer, _10000_ | | Default timeout | -| maxAttempts | Integer, _20_ | | Maximum number of execution attempts | -| cacheCas | boolean, _false_ | | | +| Configuration | Accepted and _Default_ Values | Environment Var | Command Line Argument | Description | +|------------------------------|------------------------------------------|-----------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | _SHARD_ | | | Type of backplane. Currently, the only implementation is SHARD utilizing Redis | +| redisUri | String, redis://localhost:6379 | REDIS_URI | --redis_uri | Redis cluster endpoint. This must be a single URI | +| redisPassword | String, _null_ | | | Redis password, if applicable | +| redisNodes | List of Strings, _null_ | | | List of individual Redis nodes, if applicable | +| jedisPoolMaxTotal | Integer, _4000_ | | | The size of the Redis connection pool | +| workersHashName | String, _Workers_ | | | Redis key used to store a hash of registered workers | +| workerChannel | String, _WorkerChannel_ | | | Redis pubsub channel key where changes of the cluster membership are announced | +| actionCachePrefix | String, _ActionCache_ | | | Redis key prefix for all ActionCache entries | +| actionCacheExpire | Integer, _2419200_ | | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit | +| actionBlacklistPrefix | String, _ActionBlacklist_ | | | Redis key prefix for all blacklisted actions, which are rejected | +| actionBlacklistExpire | Integer, _3600_ | | | The TTL maintained for action blacklist entries | +| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | | Redis key prefix for blacklisted invocations, suffixed with a a tool invocation ID | +| operationPrefix | String, _Operation_ | | | Redis key prefix for all operations, suffixed with the operation's name | +| operationExpire | Integer, _604800_ | | | The TTL maintained for all operations, updated on each modification | +| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry | +| processingListName | String, _{Arrival}:ProcessingOperations_ | | | Redis key of a list used to ensure reliable processing of arrival queue entries with operation watch monitoring | +| processingPrefix | String, _Processing_ | | | Redis key prefix for operations which are being dequeued from the arrival queue | +| processingTimeoutMillis | Integer, _20000_ | | | Delay (in ms) used to populate processing operation entries | +| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | | Redis key used to store a list of QueueEntry awaiting execution by workers | +| dispatchingPrefix | String, _Dispatching_ | | | Redis key prefix for operations which are being dequeued from the ready to run queue | +| dispatchingTimeoutMillis | Integer, _10000_ | | | Delay (in ms) used to populate dispatching operation entries | +| dispatchedOperationsHashName | String, _DispatchedOperations_ | | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor | +| operationChannelPrefix | String, _OperationChannel_ | | | Redis pubsub channel prefix suffixed by an operation name | +| casPrefix | String, _ContentAddressableStorage_ | | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability | +| casExpire | Integer, _604800_ | | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob | +| subscribeToBackplane | boolean, _true_ | | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. If disabled, responsiveness of watchers and CAS are reduced | +| runFailsafeOperation | boolean, _true_ | | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state | +| maxQueueDepth | Integer, _100000_ | | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution | +| maxPreQueueDepth | Integer, _1000000_ | | | Maximum lengh that the arrival queue is allowed to reach to control load on the Redis cluster | +| priorityQueue | boolean, _false_ | | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag | +| timeout | Integer, _10000_ | | | Default timeout | +| maxAttempts | Integer, _20_ | | | Maximum number of execution attempts | +| cacheCas | boolean, _false_ | | | | Example: -``` +```yaml backplane: type: SHARD redisUri: "redis://localhost:6379" @@ -222,7 +225,7 @@ backplane: Example: -``` +```yaml backplane: type: SHARD redisUri: "redis://localhost:6379" @@ -259,8 +262,9 @@ backplane: | errorOperationRemainingResources | boolean, _false_ | | | | realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot | | gracefulShutdownSeconds | Integer, 0 | | Time in seconds to allow for operations in flight to finish when shutdown signal is received | +| createSymlinkOutputs | boolean, _false_ | | Creates SymlinkNodes for symbolic links discovered in output paths for actions. No verification of the symlink target path occurs. Buildstream, for example, requires this. | -``` +```yaml worker: port: 8981 publicName: "localhost:8981" @@ -277,7 +281,7 @@ worker: Example: -``` +```yaml worker: capabilities: cas: true @@ -294,7 +298,7 @@ worker: Example: -``` +```yaml worker: sandboxSettings: alwaysUse: true @@ -311,7 +315,7 @@ worker: Example: -``` +```yaml worker: dequeueMatchSettings: acceptEverything: true @@ -331,7 +335,7 @@ worker: Example: -``` +```yaml worker: storages: - type: FILESYSTEM @@ -359,7 +363,7 @@ worker: Example: -``` +```yaml worker: executionPolicies: - name: test diff --git a/_site/docs/contribute/design-documents.md b/_site/docs/contribute/design-documents.md index f7b1c0b80d..ea608696f3 100644 --- a/_site/docs/contribute/design-documents.md +++ b/_site/docs/contribute/design-documents.md @@ -5,4 +5,5 @@ parent: Contribute nav_order: 2 --- -[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing) \ No newline at end of file +[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing) +[Local and Global Resources](https://docs.google.com/document/d/1u0TkmVmdMS53PWR1hgh-a_cj3NmQYE0Favv9aGFfQZs/edit?usp=sharing) \ No newline at end of file diff --git a/_site/docs/execution/builds-without-the-bytes.md b/_site/docs/execution/builds-without-the-bytes.md index abda8c86b5..5c49025877 100644 --- a/_site/docs/execution/builds-without-the-bytes.md +++ b/_site/docs/execution/builds-without-the-bytes.md @@ -7,7 +7,7 @@ nav_order: 4 # Builds Without The Bytes -tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)` to your BWOB bazel invocations. +tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)`, to your BWOB bazel invocations, or enable `ensureOutputsPresent` in your config to set it globally. As proposed in this [issue](https://github.com/bazelbuild/bazel/issues/6862) and the accompanying document, bazel endeavors to provide a mechanism to be 'content-light' for remote execution, using only content reference addresses to request action execution and construct successively dependent action definitions. @@ -17,4 +17,4 @@ This puts BuildFarm in the uncomfortable position of never being able to expire To combat this, you can provide some metadata to buildfarm that will help to limit (but will not remove the possibility of) failed builds. -Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results. Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution. \ No newline at end of file +Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results. Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution. diff --git a/_site/docs/execution/environment.md b/_site/docs/execution/environment.md index 4a17aabe6b..d2cbca0b51 100644 --- a/_site/docs/execution/environment.md +++ b/_site/docs/execution/environment.md @@ -116,7 +116,7 @@ Next we will create a BUILD file to create our target image. We will use the sha ``` load("@io_bazel_rules_docker//container:container.bzl", "container_image") -java_image( +container_image( name = "buildfarm-shard-worker-ubuntu20-java14", base = "@ubuntu20_java14_image_base//image", files = [ diff --git a/_site/docs/execution/execution_policies.md b/_site/docs/execution/execution_policies.md index 19698a9925..f0eaf295d9 100644 --- a/_site/docs/execution/execution_policies.md +++ b/_site/docs/execution/execution_policies.md @@ -17,7 +17,7 @@ This policy type specifies that a worker should prepend a single path, and a num This example will use the buildfarm-provided executable `as-nobody`, which will upon execution demote itself to a `nobody` effective process owner uid, and perform an `execvp(2)` with the remaining provided program arguments, which will subsequently execute as a user that no longer matches the worker process. -``` +```yaml # default wrapper policy application worker: executionPolicies: @@ -50,7 +50,8 @@ These wrappers are used for detecting actions that rely on time. Below is a dem This addresses two problems in regards to an action's dependence on time. The 1st problem is when an action takes longer than it should because it's sleeping unnecessarily. The 2nd problem is when an action relies on time which causes it to eventually be broken on master despite the code not changing. Both problems are expressed below as unit tests. We demonstrate a time-spoofing mechanism (the re-writing of syscalls) which allows us to detect these problems generically over any action. The objective is to analyze builds for performance inefficiency and discover future instabilities before they occur. ### Issue 1 (slow test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -58,16 +59,19 @@ echo -n "testing... " sleep 10; echo "done" ``` + The test takes 10 seconds to run on average. -``` -bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 10.2s Stats over 10 runs: max = 10.2s, min = 10.1s, avg = 10.2s, dev = 0.0s ``` We can check for performance improvements by using the `skip-sleep` option. -``` -bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 1.0s Stats over 10 runs: max = 1.0s, min = 0.9s, avg = 1.0s, dev = 0.0s ``` @@ -75,7 +79,8 @@ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties=' Now the test is 10x faster. If skipping sleep makes an action perform significantly faster without affecting its success rate, that would warrant further investigation into the action's implementation. ### Issue 2 (future failing test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -89,12 +94,15 @@ echo "Times change." date exit -1; ``` + The test passes today, but will it pass tomorrow? Will it pass a year from now? We can find out by using the `time-shift` option. -``` -bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail + +```shell +$ bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail INFO: Found 1 test target... Times change. Mon Sep 25 18:31:09 UTC 2023 //cloud/buildfarm:future_fail FAILED in 18.0s ``` + Time is shifted to the year 2023 and the test now fails. We can fix the problem before others see it. diff --git a/_site/docs/execution/execution_properties.md b/_site/docs/execution/execution_properties.md index 85579ed099..7966c70dd2 100644 --- a/_site/docs/execution/execution_properties.md +++ b/_site/docs/execution/execution_properties.md @@ -76,37 +76,42 @@ Despite being given 1 core, they see all of the cpus and decide to spawn that ma **Standard Example:** This test will succeed when env var TESTVAR is foobar, and fail otherwise. -``` + +```shell #!/bin/bash [ "$TESTVAR" = "foobar" ] ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main FAIL ``` -``` -./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \ +```shell +$ ./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main PASS ``` + **Template Example:** If you give a range of cores, buildfarm has the authority to decide how many your operation actually claims. You can let buildfarm resolve this value for you (via [mustache](https://mustache.github.io/)). -``` +```bash #!/bin/bash [ "$MKL_NUM_THREADS" = "1" ] ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main FAIL ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_default_exec_properties='env-vars="MKL_NUM_THREADS": "{{limits.cpu.claimed}}"' \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main diff --git a/_site/docs/quick_start.md b/_site/docs/quick_start.md index 7af957ee63..8a9a9234db 100644 --- a/_site/docs/quick_start.md +++ b/_site/docs/quick_start.md @@ -25,7 +25,8 @@ Let's start with a bazel workspace with a single file to compile into an executa Create a new directory for our workspace and add the following files: `main.cc`: -``` + +```c #include int main( int argc, char *argv[] ) @@ -35,7 +36,8 @@ int main( int argc, char *argv[] ) ``` `BUILD`: -``` + +```starlark cc_binary( name = "main", srcs = ["main.cc"], @@ -118,15 +120,18 @@ That `2 remote` indicates that your compile and link ran remotely. Congratulatio ## Container Quick Start To bring up a minimal buildfarm cluster, you can run: + +```shell +$ ./examples/bf-run start ``` -./examples/bf-run start -``` + This will start all of the necessary containers at the latest version. Once the containers are up, you can build with `bazel run --remote_executor=grpc://localhost:8980 :main`. To stop the containers, run: -``` -./examples/bf-run stop + +```shell +$ ./examples/bf-run stop ``` ## Next Steps @@ -137,8 +142,8 @@ We've started our worker on the same host as our server, and also the same host You can now easily launch a new Buildfarm cluster locally or in AWS using an open sourced [Buildfarm Manager](https://github.com/80degreeswest/bfmgr). -``` -wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar -java -jar bfmgr-1.0.7.jar -Navigate to http://localhost +```shell +$ wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar +$ java -jar bfmgr-1.0.7.jar +$ open http://localhost ``` diff --git a/admin/main/pom.xml b/admin/main/pom.xml index 006217ccb4..8e128ee64c 100644 --- a/admin/main/pom.xml +++ b/admin/main/pom.xml @@ -94,7 +94,7 @@ org.json json - 20230227 + 20231013 org.projectlombok diff --git a/admin/main/src/main/resources/proto/buildfarm.proto b/admin/main/src/main/resources/proto/buildfarm.proto index f99feb3ac9..4349babc16 100644 --- a/admin/main/src/main/resources/proto/buildfarm.proto +++ b/admin/main/src/main/resources/proto/buildfarm.proto @@ -532,7 +532,7 @@ message RedisShardBackplaneConfig { int32 max_attempts = 33; } -message ShardInstanceConfig { +message ServerInstanceConfig { bool run_dispatched_monitor = 1; int32 dispatched_monitor_interval_seconds = 2; @@ -556,7 +556,7 @@ message ShardInstanceConfig { google.protobuf.Duration grpc_timeout = 8; } -message ShardWorkerInstanceConfig { +message WorkerInstanceConfig { // whether to stream stdout from processes bool stream_stdout = 6; @@ -580,7 +580,7 @@ message ShardWorkerInstanceConfig { } message ShardWorkerConfig { - ShardWorkerInstanceConfig shard_worker_instance_config = 1; + WorkerInstanceConfig shard_worker_instance_config = 1; int32 port = 2; @@ -850,7 +850,7 @@ message InstanceConfig { oneof type { MemoryInstanceConfig memory_instance_config = 3; - ShardInstanceConfig shard_instance_config = 4; + ServerInstanceConfig shard_instance_config = 4; } } diff --git a/defs.bzl b/defs.bzl index 621c03a943..b0a4ececa2 100644 --- a/defs.bzl +++ b/defs.bzl @@ -8,7 +8,7 @@ load( "@io_bazel_rules_docker//repositories:repositories.bzl", container_repositories = "repositories", ) -load("@io_grpc_grpc_java//:repositories.bzl", "grpc_java_repositories") +load("@io_grpc_grpc_java//:repositories.bzl", "IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS", "grpc_java_repositories") load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") load("@com_grail_bazel_toolchain//toolchain:rules.bzl", "llvm_toolchain") load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories") @@ -44,27 +44,8 @@ IO_GRPC_MODULES = [ ] COM_AWS_MODULES = [ - "autoscaling", - "core", - "ec2", - "secretsmanager", - "sns", - "ssm", "s3", -] - -ORG_SPRING_MODULES = [ - "spring-beans", - "spring-core", - "spring-context", - "spring-web", -] - -ORG_SPRING_BOOT_MODULES = [ - "spring-boot-autoconfigure", - "spring-boot", - "spring-boot-starter-web", - "spring-boot-starter-thymeleaf", + "secretsmanager", ] def buildfarm_init(name = "buildfarm"): @@ -80,6 +61,7 @@ def buildfarm_init(name = "buildfarm"): "com.fasterxml.jackson.core:jackson-databind:2.15.0", "com.github.ben-manes.caffeine:caffeine:2.9.0", "com.github.docker-java:docker-java:3.2.11", + "com.github.fppt:jedis-mock:1.0.10", "com.github.jnr:jffi:1.2.16", "com.github.jnr:jffi:jar:native:1.2.16", "com.github.jnr:jnr-constants:0.9.9", @@ -99,13 +81,12 @@ def buildfarm_init(name = "buildfarm"): "com.google.guava:guava:32.1.1-jre", "com.google.j2objc:j2objc-annotations:1.1", "com.google.jimfs:jimfs:1.1", - "com.google.protobuf:protobuf-java-util:3.10.0", - "com.google.protobuf:protobuf-java:3.10.0", + "com.google.protobuf:protobuf-java-util:3.19.1", + "com.google.protobuf:protobuf-java:3.19.1", "com.google.truth:truth:0.44", "org.slf4j:slf4j-simple:1.7.35", "com.googlecode.json-simple:json-simple:1.1.1", "com.jayway.jsonpath:json-path:2.4.0", - "io.github.lognet:grpc-spring-boot-starter:4.5.4", "org.bouncycastle:bcprov-jdk15on:1.70", "net.jcip:jcip-annotations:1.0", ] + ["io.netty:netty-%s:4.1.94.Final" % module for module in IO_NETTY_MODULES] + @@ -128,9 +109,6 @@ def buildfarm_init(name = "buildfarm"): "org.openjdk.jmh:jmh-core:1.23", "org.openjdk.jmh:jmh-generator-annprocess:1.23", "org.redisson:redisson:3.13.1", - ] + ["org.springframework.boot:%s:2.7.4" % module for module in ORG_SPRING_BOOT_MODULES] + - ["org.springframework:%s:5.3.23" % module for module in ORG_SPRING_MODULES] + - [ "org.threeten:threetenbp:1.3.3", "org.xerial:sqlite-jdbc:3.34.0", "org.jetbrains:annotations:16.0.2", @@ -138,6 +116,7 @@ def buildfarm_init(name = "buildfarm"): "org.projectlombok:lombok:1.18.24", ], generate_compat_repositories = True, + override_targets = IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS, repositories = [ "https://repo1.maven.org/maven2", "https://mirrors.ibiblio.org/pub/mirrors/maven2", diff --git a/deps.bzl b/deps.bzl index 912ba6e03c..525bcff752 100644 --- a/deps.bzl +++ b/deps.bzl @@ -5,8 +5,8 @@ buildfarm dependencies that can be imported into other WORKSPACE files load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file", "http_jar") load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") -RULES_JVM_EXTERNAL_TAG = "4.2" -RULES_JVM_EXTERNAL_SHA = "cd1a77b7b02e8e008439ca76fd34f5b07aecb8c752961f9640dea15e9e5ba1ca" +RULES_JVM_EXTERNAL_TAG = "5.3" +RULES_JVM_EXTERNAL_SHA = "d31e369b854322ca5098ea12c69d7175ded971435e55c18dd9dd5f29cc5249ac" def archive_dependencies(third_party): return [ @@ -22,7 +22,7 @@ def archive_dependencies(third_party): "name": "rules_jvm_external", "strip_prefix": "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG, "sha256": RULES_JVM_EXTERNAL_SHA, - "url": "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG, + "url": "https://github.com/bazelbuild/rules_jvm_external/releases/download/%s/rules_jvm_external-%s.tar.gz" % (RULES_JVM_EXTERNAL_TAG, RULES_JVM_EXTERNAL_TAG), }, { "name": "rules_pkg", @@ -41,9 +41,9 @@ def archive_dependencies(third_party): # Needed for "well-known protos" and @com_google_protobuf//:protoc. { "name": "com_google_protobuf", - "sha256": "dd513a79c7d7e45cbaeaf7655289f78fd6b806e52dbbd7018ef4e3cf5cff697a", - "strip_prefix": "protobuf-3.15.8", - "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.15.8.zip"], + "sha256": "25f1292d4ea6666f460a2a30038eef121e6c3937ae0f61d610611dfb14b0bd32", + "strip_prefix": "protobuf-3.19.1", + "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.19.1.zip"], }, { "name": "com_github_bazelbuild_buildtools", @@ -97,9 +97,9 @@ def archive_dependencies(third_party): }, { "name": "rules_cc", - "sha256": "3d9e271e2876ba42e114c9b9bc51454e379cbf0ec9ef9d40e2ae4cec61a31b40", - "strip_prefix": "rules_cc-0.0.6", - "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.6/rules_cc-0.0.6.tar.gz", + "sha256": "2037875b9a4456dce4a79d112a8ae885bbc4aad968e6587dca6e64f3a0900cdf", + "strip_prefix": "rules_cc-0.0.9", + "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.9/rules_cc-0.0.9.tar.gz", }, # Used to format proto files diff --git a/examples/config.yml b/examples/config.yml index dd165a4d46..008bf2e1ca 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -3,6 +3,7 @@ defaultActionTimeout: 600 maximumActionTimeout: 3600 maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 prometheusPort: 9090 +allowSymlinkTargetAbsolute: false server: instanceType: SHARD name: shard @@ -22,6 +23,7 @@ server: dispatchedMonitorIntervalSeconds: 1 runOperationQueuer: true ensureOutputsPresent: false + runFailsafeOperation: true maxCpu: 0 maxRequeueAttempts: 5 useDenyList: true @@ -30,6 +32,7 @@ server: recordBesEvents: false clusterId: local cloudRegion: us-east-1 + gracefulShutdownSeconds: 0 caches: directoryCacheMaxEntries: 10000 commandCacheMaxEntries: 10000 @@ -70,8 +73,6 @@ backplane: operationChannelPrefix: "OperationChannel" casPrefix: "ContentAddressableStorage" casExpire: 604800 # 1 week - subscribeToBackplane: true - runFailsafeOperation: true maxQueueDepth: 100000 maxPreQueueDepth: 1000000 priorityQueue: false @@ -111,14 +112,14 @@ worker: skipLoad: false hexBucketLevels: 0 execRootCopyFallback: false - target: - publishTtlMetric: false + #- type: GRPC + # target: "grpc://host:port" executeStageWidth: 1 inputFetchStageWidth: 1 inputFetchDeadline: 60 linkInputDirectories: true - realInputDirectories: - - "external" + linkedInputDirectories: + - "(?!external)[^/]+" execOwner: defaultMaxCores: 0 limitGlobalExecution: false @@ -130,6 +131,7 @@ worker: alwaysUse: false selectForBlockNetwork: false selectForTmpFs: false + createSymlinkOutputs: false executionPolicies: - name: test executionWrapper: diff --git a/images.bzl b/images.bzl index 9ab0a8b0b7..939a752ed5 100644 --- a/images.bzl +++ b/images.bzl @@ -27,25 +27,9 @@ def buildfarm_images(): ) container_pull( - name = "ubuntu-bionic", - digest = "sha256:4bc527c7a288da405f2041928c63d0a6479a120ad63461c2f124c944def54be2", + name = "ubuntu-mantic", + digest = "sha256:1419bba15470a95242e917b3abcd8981ae36707b99df5ac705e1eee4d920c51c", registry = "index.docker.io", repository = "bazelbuild/buildfarm-worker-base", - tag = "bionic-java11-gcc", - ) - - container_pull( - name = "ubuntu-jammy", - digest = "sha256:da847ee259ebe7f00631a2f0146d9add60ff0f94b031a2e522ce94c78b1335c2", - registry = "index.docker.io", - repository = "bazelbuild/buildfarm-worker-base", - tag = "jammy-java11-gcc", - ) - - container_pull( - name = "amazon_corretto_java_image_base", - registry = "index.docker.io", - repository = "amazoncorretto", - tag = "19", - digest = "sha256:81d0df4412140416b27211c999e1f3c4565ae89a5cd92889475d20af422ba507", + tag = "mantic-java17-gcc", ) diff --git a/jvm_flags.bzl b/jvm_flags.bzl index 363f161465..440e0718aa 100644 --- a/jvm_flags.bzl +++ b/jvm_flags.bzl @@ -54,6 +54,12 @@ def ensure_accurate_metadata(): "//config:windows": ["-Dsun.nio.fs.ensureAccurateMetadata=true"], }) +def add_opens_sun_nio_fs(): + return select({ + "//conditions:default": [], + "//config:windows": ["--add-opens java.base/sun.nio.fs=ALL-UNNAMED"], + }) + def server_telemetry(): return select({ "//config:open_telemetry": SERVER_TELEMETRY_JVM_FLAGS, @@ -67,7 +73,7 @@ def worker_telemetry(): }) def server_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + server_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + server_telemetry() def worker_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + worker_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + worker_telemetry() diff --git a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java index a27b6a9e99..89f2e6a5be 100644 --- a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java +++ b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java @@ -10,12 +10,16 @@ public class JavaProcessWrapper extends ProcessWrapper { + // Get the path of the JVM from the current process to avoid breaking the Bazel sandbox + public static final String CURRENT_JVM_COMMAND = + ProcessHandle.current().info().command().orElseThrow(() -> new RuntimeException("Unable to retrieve the path of the running JVM")); + public JavaProcessWrapper( Path workDir, String classPath, String fullClassName, String[] args ) throws IOException { super(workDir, cmdArgs( new String[]{ - "java", + CURRENT_JVM_COMMAND, "-cp", classPath, fullClassName diff --git a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java index 0cdc68a7ff..9712394203 100644 --- a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java +++ b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java @@ -16,6 +16,7 @@ import persistent.bazel.client.PersistentWorker; import persistent.bazel.client.WorkerKey; +import persistent.common.processes.JavaProcessWrapper; import persistent.testutil.ProcessUtils; import persistent.testutil.WorkerUtils; @@ -55,7 +56,7 @@ public void endToEndAdder() throws Exception { ); ImmutableList initCmd = ImmutableList.of( - "java", + JavaProcessWrapper.CURRENT_JVM_COMMAND, "-cp", jarPath.toString(), "adder.Adder", diff --git a/src/main/java/build/buildfarm/BUILD b/src/main/java/build/buildfarm/BUILD index 601aa38eb4..3cbdeb5231 100644 --- a/src/main/java/build/buildfarm/BUILD +++ b/src/main/java/build/buildfarm/BUILD @@ -1,4 +1,4 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") package( default_visibility = ["//src:__subpackages__"], @@ -15,7 +15,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.server.BuildFarmServer", visibility = ["//visibility:public"], runtime_deps = [ @@ -29,7 +29,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.worker.shard.Worker", visibility = ["//visibility:public"], runtime_deps = [ diff --git a/src/main/java/build/buildfarm/admin/Admin.java b/src/main/java/build/buildfarm/admin/Admin.java deleted file mode 100644 index ca739f2347..0000000000 --- a/src/main/java/build/buildfarm/admin/Admin.java +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.admin; - -import build.buildfarm.v1test.GetHostsResult; - -public interface Admin { - void terminateHost(String hostId); - - void stopContainer(String hostId, String containerName); - - GetHostsResult getHosts(String filter, int ageInMinutes, String status); - - void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent); - - void disableHostScaleInProtection(String instanceName); - - void disableHostScaleInProtection(String clusterEndpoint, String instanceIp); -} diff --git a/src/main/java/build/buildfarm/admin/BUILD b/src/main/java/build/buildfarm/admin/BUILD deleted file mode 100644 index a5e481399e..0000000000 --- a/src/main/java/build/buildfarm/admin/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -java_library( - name = "admin", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java b/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java deleted file mode 100644 index f40dd72c95..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.admin.aws; - -import static build.buildfarm.common.grpc.Channels.createChannel; - -import build.buildfarm.admin.Admin; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.Host; -import com.amazonaws.services.autoscaling.AmazonAutoScaling; -import com.amazonaws.services.autoscaling.AmazonAutoScalingClientBuilder; -import com.amazonaws.services.autoscaling.model.InstancesDistribution; -import com.amazonaws.services.autoscaling.model.MixedInstancesPolicy; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionRequest; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionResult; -import com.amazonaws.services.autoscaling.model.UpdateAutoScalingGroupRequest; -import com.amazonaws.services.ec2.AmazonEC2; -import com.amazonaws.services.ec2.AmazonEC2ClientBuilder; -import com.amazonaws.services.ec2.model.DescribeInstancesRequest; -import com.amazonaws.services.ec2.model.DescribeInstancesResult; -import com.amazonaws.services.ec2.model.Filter; -import com.amazonaws.services.ec2.model.Instance; -import com.amazonaws.services.ec2.model.Reservation; -import com.amazonaws.services.ec2.model.Tag; -import com.amazonaws.services.ec2.model.TerminateInstancesRequest; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagement; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagementClientBuilder; -import com.amazonaws.services.simplesystemsmanagement.model.SendCommandRequest; -import com.google.protobuf.util.Timestamps; -import io.grpc.ManagedChannel; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.springframework.stereotype.Component; - -@Log -@Component -public class AwsAdmin implements Admin { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private AmazonAutoScaling scale; - private AmazonEC2 ec2; - private AWSSimpleSystemsManagement ssm; - - public AwsAdmin() { - String region = configs.getServer().getCloudRegion(); - if (region != null) { - scale = AmazonAutoScalingClientBuilder.standard().withRegion(region).build(); - ec2 = AmazonEC2ClientBuilder.standard().withRegion(region).build(); - ssm = AWSSimpleSystemsManagementClientBuilder.standard().withRegion(region).build(); - } else { - log.warning("Missing cloudRegion configuration. AWS Admin will not be enabled."); - } - } - - @Override - public void terminateHost(String hostId) { - ec2.terminateInstances(new TerminateInstancesRequest().withInstanceIds(hostId)); - log.log(Level.INFO, String.format("Terminated host: %s", hostId)); - } - - @Override - public void stopContainer(String hostId, String containerName) { - String stopContainerCmd = - "docker ps | grep " + containerName + " | awk '{print $1 }' | xargs -I {} docker stop {}"; - Map> parameters = new HashMap<>(); - parameters.put("commands", Collections.singletonList(stopContainerCmd)); - ssm.sendCommand( - new SendCommandRequest() - .withDocumentName("AWS-RunShellScript") - .withInstanceIds(hostId) - .withParameters(parameters)); - log.log(Level.INFO, String.format("Stopped container: %s on host: %s", containerName, hostId)); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - GetHostsResult.Builder resultBuilder = GetHostsResult.newBuilder(); - List hosts = new ArrayList<>(); - DescribeInstancesResult instancesResult = - ec2.describeInstances( - new DescribeInstancesRequest() - .withFilters(new Filter().withName("tag-value").withValues(filter))); - long hostNum = 1L; - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - long uptime = getHostUptimeInMinutes(e.getLaunchTime()); - if (e.getPrivateIpAddress() != null - && uptime > ageInMinutes - && status.equalsIgnoreCase(e.getState().getName())) { - Host.Builder hostBuilder = Host.newBuilder(); - hostBuilder.setHostNum(hostNum++); - hostBuilder.setDnsName(e.getPrivateDnsName()); - hostBuilder.setHostId(e.getInstanceId()); - hostBuilder.setIpAddress(e.getPrivateIpAddress()); - hostBuilder.setLaunchTime(Timestamps.fromMillis(e.getLaunchTime().getTime())); - hostBuilder.setLifecycle( - e.getInstanceLifecycle() != null ? e.getInstanceLifecycle() : "on demand"); - hostBuilder.setNumCores(e.getCpuOptions().getCoreCount()); - hostBuilder.setState(e.getState().getName()); - hostBuilder.setType(e.getInstanceType()); - hostBuilder.setUptimeMinutes(uptime); - hosts.add(hostBuilder.build()); - } - } - } - resultBuilder.addAllHosts(hosts); - resultBuilder.setNumHosts(hosts.size()); - log.log(Level.FINER, String.format("Got %d hosts for filter: %s", hosts.size(), filter)); - return resultBuilder.build(); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - UpdateAutoScalingGroupRequest request = - new UpdateAutoScalingGroupRequest().withAutoScalingGroupName(scaleGroupName); - if (minHosts != null) { - request.setMinSize(minHosts); - } - if (maxHosts != null) { - request.setMaxSize(maxHosts); - } - if (targetHosts != null) { - request.setMaxSize(targetHosts); - } - if (targetReservedHostsPercent != null) { - request.setMixedInstancesPolicy( - new MixedInstancesPolicy() - .withInstancesDistribution( - new InstancesDistribution() - .withOnDemandPercentageAboveBaseCapacity(targetReservedHostsPercent))); - } - scale.updateAutoScalingGroup(request); - log.log(Level.INFO, String.format("Scaled: %s", scaleGroupName)); - } - - private long getHostUptimeInMinutes(Date launchTime) { - Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC")); - return (cal.getTime().getTime() - launchTime.getTime()) / 60000; - } - - /** - * Disable instance scale in protection so that auto scaler can shutdown the instance. - * - * @param privateDnsName the private Dns name of instance (i.e. ip-xx-xxx-xx-xx.ec2.internal) - */ - @Override - public void disableHostScaleInProtection(String privateDnsName) { - // 1 get AutoScalingGroup and InstanceId - Instance workerInstance = getInstanceId(privateDnsName); - if (workerInstance == null) { - String errorMessage = "Cannot find instance with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - String instanceId = workerInstance.getInstanceId(); - String autoScalingGroup = getTagValue(workerInstance.getTags()); - if (autoScalingGroup == null || autoScalingGroup.length() == 0) { - String errorMessage = - "Cannot find AutoScalingGroup name of worker with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - - // 2 disable scale in protection of the worker - SetInstanceProtectionRequest disableProtectionRequest = - new SetInstanceProtectionRequest() - .withInstanceIds(instanceId) - .withAutoScalingGroupName(autoScalingGroup) - .withProtectedFromScaleIn(false); - SetInstanceProtectionResult result = scale.setInstanceProtection(disableProtectionRequest); - log.log( - Level.INFO, - String.format( - "Disable protection of host: %s in AutoScalingGroup: %s and get result: %s", - instanceId, autoScalingGroup, result.toString())); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - ManagedChannel channel = null; - try { - channel = createChannel(clusterEndpoint); - AdminGrpc.AdminBlockingStub adminBlockingStub = AdminGrpc.newBlockingStub(channel); - adminBlockingStub.disableScaleInProtection( - DisableScaleInProtectionRequest.newBuilder().setInstanceName(instanceIp).build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - private String getTagValue(List tags) { - for (Tag tag : tags) { - if ("aws:autoscaling:groupName".equalsIgnoreCase(tag.getKey())) { - return tag.getValue(); - } - } - return null; - } - - private Instance getInstanceId(String privateDnsName) { - DescribeInstancesRequest describeInstancesRequest = - new DescribeInstancesRequest() - .withFilters(new Filter().withName("private-dns-name").withValues(privateDnsName)); - DescribeInstancesResult instancesResult = ec2.describeInstances(describeInstancesRequest); - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - if (e.getPrivateDnsName() != null && e.getPrivateDnsName().equals(privateDnsName)) { - return e; - } - } - } - return null; - } -} diff --git a/src/main/java/build/buildfarm/admin/aws/BUILD b/src/main/java/build/buildfarm/admin/aws/BUILD deleted file mode 100644 index a49e431bfd..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/common/grpc", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_autoscaling", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_ec2", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_ssm", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:io_grpc_grpc_api", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/BUILD b/src/main/java/build/buildfarm/admin/gcp/BUILD deleted file mode 100644 index 3d94b91f3f..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java b/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java deleted file mode 100644 index 34ee9a0163..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.admin.gcp; - -import build.buildfarm.admin.Admin; -import build.buildfarm.v1test.GetHostsResult; -import org.springframework.stereotype.Component; - -@Component -public class GcpAdmin implements Admin { - @Override - public void terminateHost(String hostId) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void stopContainer(String hostId, String containerName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String instanceName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - throw new UnsupportedOperationException("Not Implemented"); - } -} diff --git a/src/main/java/build/buildfarm/backplane/Backplane.java b/src/main/java/build/buildfarm/backplane/Backplane.java index 513f3451ac..b1b05d5c42 100644 --- a/src/main/java/build/buildfarm/backplane/Backplane.java +++ b/src/main/java/build/buildfarm/backplane/Backplane.java @@ -282,4 +282,7 @@ boolean pollOperation(QueueEntry queueEntry, ExecutionStage.Value stage, long re Boolean propertiesEligibleForQueue(List provisions); GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest request) throws IOException; + + /** Set expiry time for digests */ + void updateDigestsExpiry(Iterable digests) throws IOException; } diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java index 4a9aac68a8..d7fa82ba42 100644 --- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java +++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java @@ -146,7 +146,19 @@ public abstract class CASFileCache implements ContentAddressableStorage { Gauge.build().name("cas_size").help("CAS size.").register(); private static final Gauge casEntryCountMetric = Gauge.build().name("cas_entry_count").help("Number of entries in the CAS.").register(); - private static Histogram casTtl; + private static Histogram casTtl = + Histogram.build() + .name("cas_ttl_s") + .buckets( + 3600, // 1 hour + 21600, // 6 hours + 86400, // 1 day + 345600, // 4 days + 604800, // 1 week + 1210000 // 2 weeks + ) + .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") + .register(); private static final Gauge casCopyFallbackMetric = Gauge.build() @@ -161,7 +173,6 @@ public abstract class CASFileCache implements ContentAddressableStorage { private final EntryPathStrategy entryPathStrategy; private final long maxSizeInBytes; private final long maxEntrySizeInBytes; - private final boolean publishTtlMetric; private final boolean execRootFallback; private final DigestUtil digestUtil; private final ConcurrentMap keyReferences; @@ -308,7 +319,6 @@ public CASFileCache( maxEntrySizeInBytes, config.getHexBucketLevels(), config.isFileDirectoriesIndexInMemory(), - config.isPublishTtlMetric(), config.isExecRootCopyFallback(), digestUtil, expireService, @@ -327,7 +337,6 @@ public CASFileCache( long maxEntrySizeInBytes, int hexBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -341,7 +350,6 @@ public CASFileCache( this.root = root; this.maxSizeInBytes = maxSizeInBytes; this.maxEntrySizeInBytes = maxEntrySizeInBytes; - this.publishTtlMetric = publishTtlMetric; this.execRootFallback = execRootFallback; this.digestUtil = digestUtil; this.expireService = expireService; @@ -353,23 +361,8 @@ public CASFileCache( this.delegateSkipLoad = delegateSkipLoad; this.directoriesIndexDbName = directoriesIndexDbName; this.keyReferences = Maps.newConcurrentMap(); - if (publishTtlMetric) { - casTtl = - Histogram.build() - .name("cas_ttl_s") - .buckets( - 3600, // 1 hour - 21600, // 6 hours - 86400, // 1 day - 345600, // 4 days - 604800, // 1 week - 1210000 // 2 weeks - ) - .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") - .register(); - } - - entryPathStrategy = new HexBucketEntryPathStrategy(root, hexBucketLevels); + + entryPathStrategy = new HexBucketEntryPathStrategy(root, hexBucketLevels); String directoriesIndexUrl = "jdbc:sqlite:"; if (directoriesIndexDbName.equals(DIRECTORIES_INDEX_NAME_MEMORY)) { @@ -888,6 +881,9 @@ public synchronized void reset() { + key.getIdentifier(), e); } finally { + if (closedFuture != null) { + closedFuture.set(null); + } isReset = true; } } @@ -2321,7 +2317,7 @@ private void getDirectoryKeys( } } - public ListenableFuture putDirectory( + public ListenableFuture putDirectory( Digest digest, Map directoriesIndex, ExecutorService service) { // Claim lock. // Claim the directory path so no other threads try to create/delete it. @@ -2337,7 +2333,7 @@ public ListenableFuture putDirectory( log.log(Level.FINER, format("locked directory %s", path.getFileName())); // Now that a lock has been claimed, we can proceed to create the directory. - ListenableFuture putFuture; + ListenableFuture putFuture; try { putFuture = putDirectorySynchronized(path, digest, directoriesIndex, service); } catch (IOException e) { @@ -2392,8 +2388,26 @@ private boolean directoryEntryExists( return false; } + public static class PathResult { + private final Path path; + private final boolean missed; + + public PathResult(Path path, boolean missed) { + this.path = path; + this.missed = missed; + } + + public Path getPath() { + return path; + } + + public boolean getMissed() { + return missed; + } + } + @SuppressWarnings("ConstantConditions") - private ListenableFuture putDirectorySynchronized( + private ListenableFuture putDirectorySynchronized( Path path, Digest digest, Map directoriesByDigest, ExecutorService service) throws IOException { log.log(Level.FINER, format("directory %s has been locked", path.getFileName())); @@ -2425,7 +2439,7 @@ private ListenableFuture putDirectorySynchronized( if (e != null) { log.log(Level.FINER, format("found existing entry for %s", path.getFileName())); if (directoryEntryExists(path, e, directoriesByDigest)) { - return immediateFuture(path); + return immediateFuture(new PathResult(path, /* missed=*/ false)); } log.log( Level.SEVERE, @@ -2558,7 +2572,7 @@ private ListenableFuture putDirectorySynchronized( : directoriesByDigest.get(digest), Deadline.after(10, SECONDS)); directoryStorage.put(digest, e); - return path; + return new PathResult(path, /* missed=*/ true); }, service); } @@ -2886,25 +2900,19 @@ private final void renamePath(Path a, Path b) throws IOException, FileAlreadyExi } private void deleteExpiredKey(String key) throws IOException { - // We don't want publishing the metric to delay the deletion of the file. - // We publish the metric only after the file has been deleted. - long createdTime = 0; Path path = getRemovingPath(key); - if (publishTtlMetric) { - createdTime = path.toFile().lastModified(); - } + long createdTimeMs = Files.getLastModifiedTime(path).to(MILLISECONDS); deleteFilePath(path); - if (publishTtlMetric) { - publishExpirationMetric(createdTime); - } + publishExpirationMetric(createdTimeMs); } - private void publishExpirationMetric(long createdTime) { - long currentTime = new Date().getTime(); - long ttl = currentTime - createdTime; - casTtl.observe(Time.millisecondsToSeconds(ttl)); + private void publishExpirationMetric(long createdTimeMs) { + // TODO introduce ttl clock + long currentTimeMs = new Date().getTime(); + long ttlMs = currentTimeMs - createdTimeMs; + casTtl.observe(Time.millisecondsToSeconds(ttlMs)); } @SuppressWarnings({"ConstantConditions", "ResultOfMethodCallIgnored"}) diff --git a/src/main/java/build/buildfarm/common/LoggingMain.java b/src/main/java/build/buildfarm/common/LoggingMain.java index 9a4d9bd2b0..4e60a6286a 100644 --- a/src/main/java/build/buildfarm/common/LoggingMain.java +++ b/src/main/java/build/buildfarm/common/LoggingMain.java @@ -7,24 +7,22 @@ public abstract class LoggingMain { protected abstract void onShutdown() throws InterruptedException; - class ShutdownThread extends Thread { - ShutdownThread(String applicationName) { - super(null, null, applicationName + "-Shutdown", 0); - } - - @Override - public void run() { - try { - LoggingMain.this.onShutdown(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } finally { - WaitingLogManager.release(); - } + private void shutdown() { + try { + onShutdown(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + WaitingLogManager.release(); } } protected LoggingMain(String applicationName) { - Runtime.getRuntime().addShutdownHook(new ShutdownThread(applicationName)); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + /* group=*/ null, + /* target=*/ this::shutdown, + /* name=*/ applicationName + "-Shutdown")); } } diff --git a/src/main/java/build/buildfarm/common/config/Backplane.java b/src/main/java/build/buildfarm/common/config/Backplane.java index d8e12de592..8cf6f41e8c 100644 --- a/src/main/java/build/buildfarm/common/config/Backplane.java +++ b/src/main/java/build/buildfarm/common/config/Backplane.java @@ -1,7 +1,10 @@ package build.buildfarm.common.config; -import com.google.common.base.Strings; +import java.util.ArrayList; +import java.util.List; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Backplane { @@ -32,8 +35,13 @@ public enum BACKPLANE_TYPE { private String operationChannelPrefix = "OperationChannel"; private String casPrefix = "ContentAddressableStorage"; private int casExpire = 604800; // 1 Week - private boolean subscribeToBackplane = true; - private boolean runFailsafeOperation = true; + + @Getter(AccessLevel.NONE) + private boolean subscribeToBackplane = true; // deprecated + + @Getter(AccessLevel.NONE) + private boolean runFailsafeOperation = true; // deprecated + private int maxQueueDepth = 100000; private int maxPreQueueDepth = 1000000; private boolean priorityQueue = false; @@ -47,13 +55,7 @@ public enum BACKPLANE_TYPE { private boolean cacheCas = false; private long priorityPollIntervalMillis = 100; - public String getRedisUri() { - // use environment override (useful for containerized deployment) - if (!Strings.isNullOrEmpty(System.getenv("REDIS_URI"))) { - return System.getenv("REDIS_URI"); - } - - // use configured value - return redisUri; - } + // These limited resources are shared across all workers. + // An example would be a limited number of seats to a license server. + private List resources = new ArrayList<>(); } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 68106d56c7..d6ebbe9e14 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -36,10 +36,10 @@ public final class BuildfarmConfigs { private long maximumActionTimeout = 3600; private long maxEntrySizeBytes = 2147483648L; // 2 * 1024 * 1024 * 1024 private int prometheusPort = 9090; + private boolean allowSymlinkTargetAbsolute = false; private Server server = new Server(); private Backplane backplane = new Backplane(); private Worker worker = new Worker(); - private WebUI ui = new WebUI(); private ExecutionWrappers executionWrappers = new ExecutionWrappers(); private BuildfarmConfigs() {} @@ -81,6 +81,9 @@ public static BuildfarmConfigs loadServerConfigs(String[] args) throws Configura if (options.prometheusPort >= 0) { buildfarmConfigs.setPrometheusPort(options.prometheusPort); } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustServerConfigs(buildfarmConfigs); return buildfarmConfigs; } @@ -100,6 +103,9 @@ public static BuildfarmConfigs loadWorkerConfigs(String[] args) throws Configura if (options.prometheusPort >= 0) { buildfarmConfigs.setPrometheusPort(options.prometheusPort); } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustWorkerConfigs(buildfarmConfigs); return buildfarmConfigs; } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java index defeebbf48..1dbe0c7f98 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java @@ -27,4 +27,10 @@ public class BuildfarmOptions extends OptionsBase { help = "Port for the prometheus service. '0' will disable prometheus hosting", defaultValue = "-1") public int prometheusPort; + + @Option( + name = "redis_uri", + help = "URI for Redis connection. Use 'redis://' or 'rediss://' for the scheme", + defaultValue = "") + public String redisUri; } diff --git a/src/main/java/build/buildfarm/common/config/Cas.java b/src/main/java/build/buildfarm/common/config/Cas.java index 5b16fb2168..9c671d4958 100644 --- a/src/main/java/build/buildfarm/common/config/Cas.java +++ b/src/main/java/build/buildfarm/common/config/Cas.java @@ -3,7 +3,9 @@ import com.google.common.base.Strings; import java.nio.file.Path; import javax.naming.ConfigurationException; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Cas { @@ -30,8 +32,8 @@ public enum TYPE { private String target; private boolean readonly = false; - // Metrics - private boolean publishTtlMetric = false; + @Getter(AccessLevel.NONE) + private boolean publishTtlMetric = false; // deprecated public Path getValidPath(Path root) throws ConfigurationException { if (Strings.isNullOrEmpty(path)) { diff --git a/src/main/java/build/buildfarm/common/config/LimitedResource.java b/src/main/java/build/buildfarm/common/config/LimitedResource.java new file mode 100644 index 0000000000..f3b09ff621 --- /dev/null +++ b/src/main/java/build/buildfarm/common/config/LimitedResource.java @@ -0,0 +1,43 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.common.config; + +import lombok.Data; + +/** + * @class Limited Resource + * @brief A fixed amount of a specific resource. + * @details We define a limited resource as a counting semaphore whose configuration contains a name + * and a count representing a physical or logical group of units obtained by executors as a + * precondition to fulfill a long running operation. These units are released upon the + * operation's completion. The resource is requested by the action's platform properties. + */ +@Data +public class LimitedResource { + /** + * @field name + * @brief The name of the resource. + * @details This should correspond to the platform property's key name: + * resources:: + */ + public String name; + + /** + * @field amount + * @brief The total amount of the resource that's available for use during execution. + * @details As a counting semaphore, this amount becomes the limit. + */ + public int amount = 1; +} diff --git a/src/main/java/build/buildfarm/common/config/Metrics.java b/src/main/java/build/buildfarm/common/config/Metrics.java index 73a9113d7a..28e065ddbf 100644 --- a/src/main/java/build/buildfarm/common/config/Metrics.java +++ b/src/main/java/build/buildfarm/common/config/Metrics.java @@ -1,6 +1,8 @@ package build.buildfarm.common.config; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Metrics { @@ -19,7 +21,9 @@ public enum LOG_LEVEL { FINEST, } - private PUBLISHER publisher = PUBLISHER.LOG; + @Getter(AccessLevel.NONE) + private PUBLISHER publisher = PUBLISHER.LOG; // deprecated + private LOG_LEVEL logLevel = LOG_LEVEL.FINEST; private String topic; private int topicMaxConnections; diff --git a/src/main/java/build/buildfarm/common/config/Server.java b/src/main/java/build/buildfarm/common/config/Server.java index e169c2575b..0487154c2f 100644 --- a/src/main/java/build/buildfarm/common/config/Server.java +++ b/src/main/java/build/buildfarm/common/config/Server.java @@ -24,6 +24,7 @@ public enum INSTANCE_TYPE { private String sslPrivateKeyPath = null; private boolean runDispatchedMonitor = true; private int dispatchedMonitorIntervalSeconds = 1; + private boolean runFailsafeOperation = true; private boolean runOperationQueuer = true; private boolean ensureOutputsPresent = false; private int maxRequeueAttempts = 5; @@ -41,6 +42,7 @@ public enum INSTANCE_TYPE { private int maxInboundMetadataSize = 0; private ServerCacheConfigs caches = new ServerCacheConfigs(); private boolean findMissingBlobsViaBackplane = false; + private int gracefulShutdownSeconds = 0; public String getSession() { return String.format("buildfarm-server-%s-%s", getPublicName(), sessionGuid); diff --git a/src/main/java/build/buildfarm/common/config/WebUI.java b/src/main/java/build/buildfarm/common/config/WebUI.java deleted file mode 100644 index 443d210691..0000000000 --- a/src/main/java/build/buildfarm/common/config/WebUI.java +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.common.config; - -import lombok.Data; - -/** - * @class WebUI - * @brief Settings for buildfarm's web UI. - * @details Buildfarm provides a web frontend for developers to introspect builds. - */ -@Data -public class WebUI { - /** - * @field enable - * @brief Whether to enable the web frontend. - * @details When disabled there will be no ports opened or routes available. - */ - public boolean enable = false; - - /** - * @field port - * @brief HTTP port for the web frontend. - * @details 8080 is useful for local testing since port 80 requires sudo. We choose the following - * default since many ports are blocked in upstream CI. - */ - public String port = "8982"; -} diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java index 4caaff2b02..c446134fe2 100644 --- a/src/main/java/build/buildfarm/common/config/Worker.java +++ b/src/main/java/build/buildfarm/common/config/Worker.java @@ -5,6 +5,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import javax.naming.ConfigurationException; @@ -28,7 +29,7 @@ public class Worker { private int inputFetchStageWidth = 0; private int inputFetchDeadline = 60; private boolean linkInputDirectories = true; - private List realInputDirectories = Arrays.asList("external"); + private List linkedInputDirectories = Arrays.asList("(?!external)[^/]+"); private String execOwner; private int defaultMaxCores = 0; private boolean limitGlobalExecution = false; @@ -38,6 +39,12 @@ public class Worker { private int gracefulShutdownSeconds = 0; private ExecutionPolicy[] executionPolicies = {}; private SandboxSettings sandboxSettings = new SandboxSettings(); + private boolean createSymlinkOutputs = false; + + // These limited resources are only for the individual worker. + // An example would be hardware resources such as GPUs. + // If you want GPU actions to run exclusively, define a single GPU resource. + private List resources = new ArrayList<>(); public ExecutionPolicy[] getExecutionPolicies() { if (executionPolicies != null) { diff --git a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java index 26b67f5c6c..4e500398b2 100644 --- a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java +++ b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java @@ -227,6 +227,14 @@ public void onNext(WriteResponse response) { @Override public void onError(Throwable t) { + log.log( + WARNING, + format( + "%s: write(%s) on worker %s after %d bytes of content", + Status.fromThrowable(t).getCode().name(), + resourceName, + bsStub.get().getChannel().authority(), + writtenBytes)); writeFuture.setException(exceptionTranslator.apply(t)); } diff --git a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java index 52f937c8ed..9123f5eaf9 100644 --- a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java +++ b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java @@ -236,6 +236,17 @@ public String dequeue(JedisCluster jedis) throws InterruptedException { } } + /** + * @brief Pop element into internal dequeue and return value. + * @details Null is returned if the queue is empty. + * @return The value of the transfered element. null if queue is empty or thread was interrupted. + * @note Suggested return identifier: val. + */ + public String nonBlockingDequeue(JedisCluster jedis) throws InterruptedException { + QueueInterface queue = queues.get(roundRobinPopIndex()); + return queue.nonBlockingDequeue(jedis); + } + /** * @brief Get the current pop queue. * @details Get the queue that the balanced queue intends to pop from next. diff --git a/src/main/java/build/buildfarm/instance/MatchListener.java b/src/main/java/build/buildfarm/instance/MatchListener.java index 46f6dfc000..3da7df90eb 100644 --- a/src/main/java/build/buildfarm/instance/MatchListener.java +++ b/src/main/java/build/buildfarm/instance/MatchListener.java @@ -27,7 +27,4 @@ public interface MatchListener { boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException; void onError(Throwable t); - - // method that should be called when this match is cancelled and no longer valid - void setOnCancelHandler(Runnable onCancelHandler); } diff --git a/src/main/java/build/buildfarm/instance/server/BUILD b/src/main/java/build/buildfarm/instance/server/BUILD index ecff968e79..1c63e9896c 100644 --- a/src/main/java/build/buildfarm/instance/server/BUILD +++ b/src/main/java/build/buildfarm/instance/server/BUILD @@ -1,8 +1,8 @@ java_library( name = "server", srcs = [ - "AbstractServerInstance.java", "GetDirectoryFunction.java", + "NodeInstance.java", "OperationsMap.java", "WatchFuture.java", ], diff --git a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java b/src/main/java/build/buildfarm/instance/server/NodeInstance.java similarity index 98% rename from src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java rename to src/main/java/build/buildfarm/instance/server/NodeInstance.java index f0f9b4e9b7..c3cfe4d97e 100644 --- a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java +++ b/src/main/java/build/buildfarm/instance/server/NodeInstance.java @@ -145,7 +145,7 @@ import lombok.extern.java.Log; @Log -public abstract class AbstractServerInstance implements Instance { +public abstract class NodeInstance implements Instance { private final String name; protected final ContentAddressableStorage contentAddressableStorage; protected final ActionCache actionCache; @@ -179,6 +179,8 @@ public abstract class AbstractServerInstance implements Instance { public static final String ENVIRONMENT_VARIABLES_NOT_SORTED = "The `Command`'s `environment_variables` are not correctly sorted by `name`."; + public static final String SYMLINK_TARGET_ABSOLUTE = "A symlink target is absolute."; + public static final String MISSING_ACTION = "The action was not found in the CAS."; public static final String MISSING_COMMAND = "The command was not found in the CAS."; @@ -228,7 +230,7 @@ public abstract class AbstractServerInstance implements Instance { public static final String NO_REQUEUE_COMPLETE_MESSAGE = "Operation %s not requeued. Operation has already completed."; - public AbstractServerInstance( + public NodeInstance( String name, DigestUtil digestUtil, ContentAddressableStorage contentAddressableStorage, @@ -802,6 +804,7 @@ public static void validateActionInputDirectory( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -850,6 +853,14 @@ public static void validateActionInputDirectory( .setSubject("/" + directoryPath + ": " + lastSymlinkName + " > " + symlinkName) .setDescription(DIRECTORY_NOT_SORTED); } + String symlinkTarget = symlinkNode.getTarget(); + if (!allowSymlinkTargetAbsolute && symlinkTarget.charAt(0) == '/') { + preconditionFailure + .addViolationsBuilder() + .setType(VIOLATION_TYPE_INVALID) + .setSubject("/" + directoryPath + ": " + symlinkName + " -> " + symlinkTarget) + .setDescription(SYMLINK_TARGET_ABSOLUTE); + } /* FIXME serverside validity check? regex? Preconditions.checkState( isValidFilename(symlinkName), @@ -919,6 +930,7 @@ public static void validateActionInputDirectory( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -934,6 +946,7 @@ private static void validateActionInputDirectoryDigest( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -958,6 +971,7 @@ private static void validateActionInputDirectoryDigest( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -1215,12 +1229,16 @@ protected void validateAction( ImmutableSet.Builder inputFilesBuilder = ImmutableSet.builder(); inputDirectoriesBuilder.add(ACTION_INPUT_ROOT_DIRECTORY_PATH); + boolean allowSymlinkTargetAbsolute = + getCacheCapabilities().getSymlinkAbsolutePathStrategy() + == SymlinkAbsolutePathStrategy.Value.ALLOWED; validateActionInputDirectoryDigest( ACTION_INPUT_ROOT_DIRECTORY_PATH, action.getInputRootDigest(), new Stack<>(), new HashSet<>(), directoriesIndex, + allowSymlinkTargetAbsolute, inputFilesBuilder::add, inputDirectoriesBuilder::add, onInputDigest, @@ -1961,19 +1979,18 @@ public ServerCapabilities getCapabilities() { @Override public WorkerProfileMessage getWorkerProfile() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerProfile() method."); + "NodeInstance doesn't support getWorkerProfile() method."); } @Override public WorkerListMessage getWorkerList() { - throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerList() method."); + throw new UnsupportedOperationException("NodeInstance doesn't support getWorkerList() method."); } @Override public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support drainWorkerPipeline() method."); + "NodeInstance doesn't support drainWorkerPipeline() method."); } @Override diff --git a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java index 794b296a1f..55b2933e42 100644 --- a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java @@ -121,4 +121,11 @@ Map> getMap(RedisClient client, Iterable blobDigests * @note Suggested return identifier: mapSize. */ int size(RedisClient client) throws IOException; + + /** + * @brief Set the expiry duration for the digests. + * @param client Client used for interacting with redis when not using cacheMap. + * @param blobDigests The blob digests to set new the expiry duration. + */ + void setExpire(RedisClient client, Iterable blobDigests) throws IOException; } diff --git a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java index d035d10491..65d1f06b87 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java @@ -234,6 +234,17 @@ public int size(RedisClient client) throws IOException { return client.call(jedis -> ScanCount.get(jedis, name + ":*", 1000)); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) throws IOException { + client.run( + jedis -> { + for (Digest blobDigest : blobDigests) { + String key = redisCasKey(blobDigest); + jedis.expire(key, keyExpiration_s); + } + }); + } + /** * @brief Get the redis key name. * @details This is to be used for the direct redis implementation. diff --git a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java index 5f85da29f6..2bfb48ac9e 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java @@ -58,9 +58,7 @@ public static Supplier create(String identifier) throws Configurat list2Set(redisNodes), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), - Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword()) - ? null - : configs.getBackplane().getRedisPassword(), + Strings.emptyToNull(configs.getBackplane().getRedisPassword()), createJedisPoolConfig()); } @@ -70,9 +68,7 @@ public static Supplier create(String identifier) throws Configurat parseUri(configs.getBackplane().getRedisUri()), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), - Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword()) - ? null - : configs.getBackplane().getRedisPassword(), + Strings.emptyToNull(configs.getBackplane().getRedisPassword()), createJedisPoolConfig()); } diff --git a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java index 64f1f2befa..78dd77c769 100644 --- a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java +++ b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java @@ -24,6 +24,7 @@ import com.google.common.collect.SetMultimap; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import redis.clients.jedis.JedisCluster; /** @@ -48,6 +49,14 @@ public class OperationQueue { */ private final List queues; + /** + * @field currentDequeueIndex + * @brief The current queue index to dequeue from. + * @details Used in a round-robin fashion to ensure an even distribution of dequeues across + * matched queues. + */ + private int currentDequeueIndex = 0; + /** * @brief Constructor. * @details Construct the operation queue with various provisioned redis queues. @@ -186,8 +195,18 @@ public void push( */ public String dequeue(JedisCluster jedis, List provisions) throws InterruptedException { - BalancedRedisQueue queue = chooseEligibleQueue(provisions); - return queue.dequeue(jedis); + // Select all matched queues, and attempt dequeuing via round-robin. + List queues = chooseEligibleQueues(provisions); + int index = roundRobinPopIndex(queues); + String value = queues.get(index).nonBlockingDequeue(jedis); + + // Keep iterating over matched queues until we find one that is non-empty and provides a + // dequeued value. + while (value == null) { + index = roundRobinPopIndex(queues); + value = queues.get(index).nonBlockingDequeue(jedis); + } + return value; } /** @@ -270,6 +289,39 @@ private BalancedRedisQueue chooseEligibleQueue(List provision } } + throwNoEligibleQueueException(provisions); + return null; + } + + /** + * @brief Choose an eligible queues based on given properties. + * @details We use the platform execution properties of a queue entry to determine the appropriate + * queues. If there no eligible queues, an exception is thrown. + * @param provisions Provisions to check that requirements are met. + * @return The chosen queues. + * @note Suggested return identifier: queues. + */ + private List chooseEligibleQueues(List provisions) { + List eligibleQueues = + queues.stream() + .filter(provisionedQueue -> provisionedQueue.isEligible(toMultimap(provisions))) + .map(provisionedQueue -> provisionedQueue.queue()) + .collect(Collectors.toList()); + + if (eligibleQueues.isEmpty()) { + throwNoEligibleQueueException(provisions); + } + + return eligibleQueues; + } + + /** + * @brief Throw an exception that explains why there are no eligible queues. + * @details This function should only be called, when there were no matched queues. + * @param provisions Provisions to check that requirements are met. + * @return no return. + */ + private void throwNoEligibleQueueException(List provisions) { // At this point, we were unable to match an action to an eligible queue. // We will build an error explaining why the matching failed. This will help user's properly // configure their queue or adjust the execution_properties of their actions. @@ -286,6 +338,34 @@ private BalancedRedisQueue chooseEligibleQueue(List provision + eligibilityResults); } + /** + * @brief Get the current queue index for round-robin dequeues. + * @details Adjusts the round-robin index for next call. + * @param matchedQueues The queues to round robin. + * @return The current round-robin index. + * @note Suggested return identifier: queueIndex. + */ + private int roundRobinPopIndex(List matchedQueues) { + int currentIndex = currentDequeueIndex; + currentDequeueIndex = nextQueueInRoundRobin(currentDequeueIndex, matchedQueues); + return currentIndex; + } + + /** + * @brief Get the next queue in the round robin. + * @details If we are currently on the last queue it becomes the first queue. + * @param index Current queue index. + * @param matchedQueues The queues to round robin. + * @return And adjusted val based on the current queue index. + * @note Suggested return identifier: adjustedCurrentQueue. + */ + private int nextQueueInRoundRobin(int index, List matchedQueues) { + if (index >= matchedQueues.size() - 1) { + return 0; + } + return index + 1; + } + /** * @brief Convert proto provisions into java multimap. * @details This conversion is done to more easily check if a key/value exists in the provisions. diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index a0d96ea081..68da71f811 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -129,6 +129,8 @@ public class RedisShardBackplane implements Backplane { .build()); private final String source; // used in operation change publication + private final boolean subscribeToBackplane; + private final boolean runFailsafeOperation; private final Function onPublish; private final Function onComplete; private final Supplier jedisClusterFactory; @@ -149,18 +151,30 @@ public class RedisShardBackplane implements Backplane { public RedisShardBackplane( String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete) throws ConfigurationException { - this(source, onPublish, onComplete, JedisClusterFactory.create(source)); + this( + source, + subscribeToBackplane, + runFailsafeOperation, + onPublish, + onComplete, + JedisClusterFactory.create(source)); } public RedisShardBackplane( String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete, Supplier jedisClusterFactory) { this.source = source; + this.subscribeToBackplane = subscribeToBackplane; + this.runFailsafeOperation = runFailsafeOperation; this.onPublish = onPublish; this.onComplete = onComplete; this.jedisClusterFactory = jedisClusterFactory; @@ -520,10 +534,10 @@ public void start(String clientPublicName) throws IOException { // Create containers that make up the backplane state = DistributedStateCreator.create(client); - if (configs.getBackplane().isSubscribeToBackplane()) { + if (subscribeToBackplane) { startSubscriptionThread(); } - if (configs.getBackplane().isRunFailsafeOperation()) { + if (runFailsafeOperation) { startFailsafeOperationThread(); } @@ -1500,4 +1514,9 @@ public GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest req } return GetClientStartTimeResult.newBuilder().addAllClientStartTime(startTimes).build(); } + + @Override + public void updateDigestsExpiry(Iterable digests) throws IOException { + state.casWorkerMap.setExpire(client, digests); + } } diff --git a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java index 234e15fb15..52b010c31f 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java @@ -209,6 +209,14 @@ public int size(RedisClient client) { return cacheMap.size(); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) { + for (Digest blobDigest : blobDigests) { + String key = cacheMapCasKey(blobDigest); + cacheMap.expireKey(key, keyExpiration_s, TimeUnit.SECONDS); + } + } + /** * @brief Get a random element from the set. * @details Assumes the set is not empty. diff --git a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java index 640878707f..8a00e9ae96 100644 --- a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java @@ -30,7 +30,7 @@ import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance.WorkersCallback; +import build.buildfarm.instance.shard.ServerInstance.WorkersCallback; import com.google.common.base.Throwables; import com.google.common.cache.LoadingCache; import com.google.common.collect.Iterables; diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java similarity index 96% rename from src/main/java/build/buildfarm/instance/shard/ShardInstance.java rename to src/main/java/build/buildfarm/instance/shard/ServerInstance.java index cc0ad69737..29edc29fd4 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java @@ -47,6 +47,7 @@ import build.bazel.remote.execution.v2.Action; import build.bazel.remote.execution.v2.ActionResult; import build.bazel.remote.execution.v2.BatchReadBlobsResponse.Response; +import build.bazel.remote.execution.v2.CacheCapabilities; import build.bazel.remote.execution.v2.Command; import build.bazel.remote.execution.v2.Compressor; import build.bazel.remote.execution.v2.Digest; @@ -61,6 +62,7 @@ import build.bazel.remote.execution.v2.Platform.Property; import build.bazel.remote.execution.v2.RequestMetadata; import build.bazel.remote.execution.v2.ResultsCachePolicy; +import build.bazel.remote.execution.v2.SymlinkAbsolutePathStrategy; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.actioncache.ShardActionCache; import build.buildfarm.backplane.Backplane; @@ -80,7 +82,7 @@ import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.Instance; import build.buildfarm.instance.MatchListener; -import build.buildfarm.instance.server.AbstractServerInstance; +import build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -136,10 +138,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.time.Instant; +import java.util.AbstractMap; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; import java.util.Deque; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -168,7 +172,7 @@ import lombok.extern.java.Log; @Log -public class ShardInstance extends AbstractServerInstance { +public class ServerInstance extends NodeInstance { private static final ListenableFuture IMMEDIATE_VOID_FUTURE = Futures.immediateFuture(null); private static final String TIMEOUT_OUT_OF_BOUNDS = @@ -257,13 +261,17 @@ public class ShardInstance extends AbstractServerInstance { private static Backplane createBackplane(String identifier) throws ConfigurationException { if (configs.getBackplane().getType().equals(SHARD)) { return new RedisShardBackplane( - identifier, ShardInstance::stripOperation, ShardInstance::stripQueuedOperation); + identifier, + /* subscribeToBackplane=*/ true, + configs.getServer().isRunFailsafeOperation(), + ServerInstance::stripOperation, + ServerInstance::stripQueuedOperation); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); } } - public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) + public ServerInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) throws InterruptedException, ConfigurationException { this( name, @@ -273,7 +281,7 @@ public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runn /* actionCacheFetchService=*/ BuildfarmExecutors.getActionCacheFetchServicePool()); } - private ShardInstance( + private ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -325,7 +333,7 @@ void initializeCaches() { .build(); } - public ShardInstance( + public ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -572,10 +580,12 @@ public void stop() throws InterruptedException { stopping = true; log.log(Level.FINER, format("Instance %s is stopping", getName())); if (operationQueuer != null) { - operationQueuer.stop(); + operationQueuer.interrupt(); + operationQueuer.join(); } if (dispatchedMonitor != null) { dispatchedMonitor.interrupt(); + dispatchedMonitor.join(); } if (prometheusMetricsThread != null) { prometheusMetricsThread.interrupt(); @@ -656,7 +666,7 @@ public ListenableFuture> findMissingBlobs( } if (configs.getServer().isFindMissingBlobsViaBackplane()) { - return findMissingBlobsViaBackplane(nonEmptyDigests); + return findMissingBlobsViaBackplane(nonEmptyDigests, requestMetadata); } return findMissingBlobsQueryingEachWorker(nonEmptyDigests, requestMetadata); @@ -723,24 +733,40 @@ private ListenableFuture> findMissingBlobsQueryingEachWorker( // out-of-date and the server lies about which blobs are actually present. We provide this // alternative strategy for calculating missing blobs. private ListenableFuture> findMissingBlobsViaBackplane( - Iterable nonEmptyDigests) { + Iterable nonEmptyDigests, RequestMetadata requestMetadata) { try { Set uniqueDigests = new HashSet<>(); nonEmptyDigests.forEach(uniqueDigests::add); Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); Set workerSet = backplane.getStorageWorkers(); Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); - return immediateFuture( + Map> digestAndWorkersMap = uniqueDigests.stream() - .filter( // best effort to present digests only missing on active workers + .map( digest -> { Set initialWorkers = foundBlobs.getOrDefault(digest, Collections.emptySet()); - return filterAndAdjustWorkersForDigest( - digest, initialWorkers, workerSet, workersStartTime) - .isEmpty(); + return new AbstractMap.SimpleEntry<>( + digest, + filterAndAdjustWorkersForDigest( + digest, initialWorkers, workerSet, workersStartTime)); }) - .collect(Collectors.toList())); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + ListenableFuture> missingDigestFuture = + immediateFuture( + digestAndWorkersMap.entrySet().stream() + .filter(entry -> entry.getValue().isEmpty()) + .map(Map.Entry::getKey) + .collect(Collectors.toList())); + return transformAsync( + missingDigestFuture, + (missingDigest) -> { + extendLeaseForDigests(digestAndWorkersMap, requestMetadata); + return immediateFuture(missingDigest); + }, + // Propagate context values but don't cascade its cancellation for downstream calls. + Context.current().fork().fixedContextExecutor(directExecutor())); } catch (Exception e) { log.log(Level.SEVERE, "find missing blob via backplane failed", e); return immediateFailedFuture(Status.fromThrowable(e).asException()); @@ -785,6 +811,29 @@ private Set filterAndAdjustWorkersForDigest( return workersStartedBeforeDigestInsertion; } + private void extendLeaseForDigests( + Map> digestAndWorkersMap, RequestMetadata requestMetadata) { + Map> workerAndDigestMap = new HashMap<>(); + digestAndWorkersMap.forEach( + (digest, workers) -> + workers.forEach( + worker -> + workerAndDigestMap.computeIfAbsent(worker, w -> new HashSet<>()).add(digest))); + + workerAndDigestMap.forEach( + (worker, digests) -> workerStub(worker).findMissingBlobs(digests, requestMetadata)); + + try { + backplane.updateDigestsExpiry(digestAndWorkersMap.keySet()); + } catch (IOException e) { + log.log( + Level.WARNING, + format( + "Failed to update expiry duration for digests (%s) insertion time", + digestAndWorkersMap.keySet())); + } + } + private void findMissingBlobsOnWorker( String requestId, Iterable blobDigests, @@ -1082,7 +1131,7 @@ public void onError(Throwable t) { backplane, workerSet, locationSet, - ShardInstance.this::workerStub, + ServerInstance.this::workerStub, blobDigest, directExecutor(), RequestMetadata.getDefaultInstance()), @@ -2285,7 +2334,7 @@ public ListenableFuture queue(ExecuteEntry executeEntry, Poller poller, Du log.log( Level.FINER, format( - "ShardInstance(%s): checkCache(%s): %sus elapsed", + "ServerInstance(%s): checkCache(%s): %sus elapsed", getName(), operation.getName(), checkCacheUSecs)); return IMMEDIATE_VOID_FUTURE; } @@ -2312,7 +2361,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): fetching action %s", + "ServerInstance(%s): queue(%s): fetching action %s", getName(), operation.getName(), actionDigest.getHash())); RequestMetadata requestMetadata = executeEntry.getRequestMetadata(); ListenableFuture actionFuture = @@ -2355,7 +2404,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): fetched action %s transforming queuedOperation", + "ServerInstance(%s): queue(%s): fetched action %s transforming queuedOperation", getName(), operation.getName(), actionDigest.getHash())); Stopwatch transformStopwatch = Stopwatch.createStarted(); return transform( @@ -2385,7 +2434,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s transformed, validating", + "ServerInstance(%s): queue(%s): queuedOperation %s transformed, validating", getName(), operation.getName(), DigestUtil.toString( @@ -2407,7 +2456,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s validated, uploading", + "ServerInstance(%s): queue(%s): queuedOperation %s validated, uploading", getName(), operation.getName(), DigestUtil.toString( @@ -2459,7 +2508,7 @@ public void onSuccess(ProfiledQueuedOperationMetadata profiledQueuedMetadata) { log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", + "ServerInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", getName(), queueOperation.getName(), checkCacheUSecs, @@ -2762,4 +2811,16 @@ private boolean inDenyList(RequestMetadata requestMetadata) throws IOException { } return backplane.isBlacklisted(requestMetadata); } + + @Override + protected CacheCapabilities getCacheCapabilities() { + SymlinkAbsolutePathStrategy.Value symlinkAbsolutePathStrategy = + configs.isAllowSymlinkTargetAbsolute() + ? SymlinkAbsolutePathStrategy.Value.ALLOWED + : SymlinkAbsolutePathStrategy.Value.DISALLOWED; + return super.getCacheCapabilities() + .toBuilder() + .setSymlinkAbsolutePathStrategy(symlinkAbsolutePathStrategy) + .build(); + } } diff --git a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java deleted file mode 100644 index e0ae2785e0..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.metrics.aws; - -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; -import com.amazonaws.ClientConfiguration; -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.handlers.AsyncHandler; -import com.amazonaws.services.secretsmanager.AWSSecretsManager; -import com.amazonaws.services.secretsmanager.AWSSecretsManagerClientBuilder; -import com.amazonaws.services.secretsmanager.model.GetSecretValueRequest; -import com.amazonaws.services.secretsmanager.model.GetSecretValueResult; -import com.amazonaws.services.sns.AmazonSNSAsync; -import com.amazonaws.services.sns.AmazonSNSAsyncClientBuilder; -import com.amazonaws.services.sns.model.PublishRequest; -import com.amazonaws.services.sns.model.PublishResult; -import com.amazonaws.util.StringUtils; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.longrunning.Operation; -import java.io.IOException; -import java.util.Base64; -import java.util.HashMap; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AwsMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static AmazonSNSAsync snsClient; - - private final String snsTopicOperations; - private String accessKeyId = null; - private String secretKey = null; - private final String region; - private final int snsClientMaxConnections; - - public AwsMetricsPublisher() { - super(configs.getServer().getClusterId()); - snsTopicOperations = configs.getServer().getMetrics().getTopic(); - region = configs.getServer().getCloudRegion(); - getAwsSecret(configs.getServer().getMetrics().getSecretName()); - snsClientMaxConnections = configs.getServer().getMetrics().getTopicMaxConnections(); - if (!StringUtils.isNullOrEmpty(snsTopicOperations) - && snsClientMaxConnections > 0 - && !StringUtils.isNullOrEmpty(accessKeyId) - && !StringUtils.isNullOrEmpty(secretKey) - && !StringUtils.isNullOrEmpty(region)) { - snsClient = initSnsClient(); - } - } - - @Override - public void publishRequestMetadata(Operation operation, RequestMetadata requestMetadata) { - try { - if (snsClient != null) { - snsClient.publishAsync( - new PublishRequest( - snsTopicOperations, - formatRequestMetadataToJson(populateRequestMetadata(operation, requestMetadata))), - new AsyncHandler() { - @Override - public void onError(Exception e) { - log.log(Level.WARNING, "Could not publish metrics data to SNS.", e); - } - - @Override - public void onSuccess(PublishRequest request, PublishResult publishResult) {} - }); - } - } catch (Exception e) { - log.log( - Level.WARNING, - String.format("Could not publish request metadata to SNS for %s.", operation.getName()), - e); - } - } - - private AmazonSNSAsync initSnsClient() { - log.log(Level.INFO, "Initializing SNS Client."); - return AmazonSNSAsyncClientBuilder.standard() - .withRegion(region) - .withClientConfiguration( - new ClientConfiguration().withMaxConnections(snsClientMaxConnections)) - .withCredentials( - new AWSStaticCredentialsProvider( - new AWSCredentials() { - @Override - public String getAWSAccessKeyId() { - return accessKeyId; - } - - @Override - public String getAWSSecretKey() { - return secretKey; - } - })) - .build(); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - } - - @SuppressWarnings("unchecked") - private void getAwsSecret(String secretName) { - AWSSecretsManager client = AWSSecretsManagerClientBuilder.standard().withRegion(region).build(); - GetSecretValueRequest getSecretValueRequest = - new GetSecretValueRequest().withSecretId(secretName); - GetSecretValueResult getSecretValueResult; - try { - getSecretValueResult = client.getSecretValue(getSecretValueRequest); - } catch (Exception e) { - log.log(Level.SEVERE, String.format("Could not get secret %s from AWS.", secretName)); - return; - } - String secret; - if (getSecretValueResult.getSecretString() != null) { - secret = getSecretValueResult.getSecretString(); - } else { - secret = - new String(Base64.getDecoder().decode(getSecretValueResult.getSecretBinary()).array()); - } - - if (secret != null) { - try { - final ObjectMapper objectMapper = new ObjectMapper(); - final HashMap secretMap = objectMapper.readValue(secret, HashMap.class); - accessKeyId = secretMap.get("access_key"); - secretKey = secretMap.get("secret_key"); - } catch (IOException e) { - log.log(Level.SEVERE, String.format("Could not parse secret %s from AWS", secretName)); - } - } - } -} diff --git a/src/main/java/build/buildfarm/metrics/aws/BUILD b/src/main/java/build/buildfarm/metrics/aws/BUILD deleted file mode 100644 index 51d44ea8c9..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_sns", - "@maven//:com_fasterxml_jackson_core_jackson_databind", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_projectlombok_lombok", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/BUILD b/src/main/java/build/buildfarm/metrics/gcp/BUILD deleted file mode 100644 index 765902b905..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java deleted file mode 100644 index 40a6ddb1f8..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.metrics.gcp; - -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; - -public class GcpMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public GcpMetricsPublisher() { - super(configs.getServer().getClusterId()); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - } -} diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index b05d2e5ef7..2cd750a4ff 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -14,9 +14,10 @@ java_library( "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/shard", "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/server/controllers:WebController", "//src/main/java/build/buildfarm/server/services", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@maven//:com_github_pcj_google_options", "@maven//:com_google_guava_guava", "@maven//:io_grpc_grpc_api", @@ -26,13 +27,5 @@ java_library( "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_bouncycastle_bcprov_jdk15on", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_thymeleaf", - "@maven//:org_springframework_boot_spring_boot_starter_web", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", ], ) diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index d0d26a6f52..c448963209 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -15,30 +15,29 @@ package build.buildfarm.server; import static build.buildfarm.common.io.Utils.formatIOError; -import static com.google.common.base.Preconditions.checkState; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; +import static java.util.concurrent.TimeUnit.SECONDS; import static java.util.logging.Level.SEVERE; +import static java.util.logging.Level.WARNING; import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.GrpcMetrics; import build.buildfarm.common.grpc.TracingMetadataUtils.ServerHeadersInterceptor; import build.buildfarm.common.services.ByteStreamService; import build.buildfarm.common.services.ContentAddressableStorageService; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance; +import build.buildfarm.instance.shard.ServerInstance; import build.buildfarm.metrics.prometheus.PrometheusPublisher; -import build.buildfarm.server.controllers.WebController; import build.buildfarm.server.services.ActionCacheService; -import build.buildfarm.server.services.AdminService; import build.buildfarm.server.services.CapabilitiesService; import build.buildfarm.server.services.ExecutionService; import build.buildfarm.server.services.FetchService; import build.buildfarm.server.services.OperationQueueService; import build.buildfarm.server.services.OperationsService; import build.buildfarm.server.services.PublishBuildEventService; -import com.google.devtools.common.options.OptionsParsingException; import io.grpc.ServerBuilder; import io.grpc.ServerInterceptor; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; @@ -49,24 +48,16 @@ import java.io.File; import java.io.IOException; import java.security.Security; -import java.util.HashMap; -import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; +import java.util.concurrent.atomic.AtomicBoolean; import javax.naming.ConfigurationException; import lombok.extern.java.Log; import org.bouncycastle.jce.provider.BouncyCastleProvider; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.annotation.ComponentScan; @SuppressWarnings("deprecation") @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class BuildFarmServer { +public class BuildFarmServer extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -80,20 +71,57 @@ public class BuildFarmServer { private Instance instance; private HealthStatusManager healthStatusManager; private io.grpc.Server server; - private boolean stopping = false; private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); + private AtomicBoolean shutdownInitiated = new AtomicBoolean(true); + private AtomicBoolean released = new AtomicBoolean(true); - private ShardInstance createInstance() + BuildFarmServer() { + super("BuildFarmServer"); + } + + /** + * The method will prepare the server for graceful shutdown when the server is ready. Current + * implementation waits specified period of time. Future improvements could be to keep track of + * open connections and shutdown when there are not left. Note on using stderr here instead of + * log. By the time this is called in PreDestroy, the log is no longer available and is not + * logging messages. + */ + public void prepareServerForGracefulShutdown() { + if (configs.getServer().getGracefulShutdownSeconds() == 0) { + log.info( + String.format("Graceful Shutdown is not enabled. Server is shutting down immediately.")); + } else { + try { + log.info( + String.format( + "Graceful Shutdown - Waiting %d to allow connections to drain.", + configs.getServer().getGracefulShutdownSeconds())); + SECONDS.sleep(configs.getServer().getGracefulShutdownSeconds()); + } catch (InterruptedException e) { + log.info( + "Graceful Shutdown - The server graceful shutdown is interrupted: " + e.getMessage()); + } finally { + log.info( + String.format( + "Graceful Shutdown - It took the server %d seconds to shutdown", + configs.getServer().getGracefulShutdownSeconds())); + } + } + } + + private ServerInstance createInstance() throws IOException, ConfigurationException, InterruptedException { - return new ShardInstance( + return new ServerInstance( configs.getServer().getName(), configs.getServer().getSession() + "-" + configs.getServer().getName(), new DigestUtil(configs.getDigestFunction()), - this::stop); + this::initiateShutdown); } public synchronized void start(ServerBuilder serverBuilder, String publicName) throws IOException, ConfigurationException, InterruptedException { + shutdownInitiated.set(false); + released.set(false); instance = createInstance(); healthStatusManager = new HealthStatusManager(); @@ -123,7 +151,6 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName .addService(new ExecutionService(instance, keepaliveScheduler)) .addService(new OperationQueueService(instance)) .addService(new OperationsService(instance)) - .addService(new AdminService(instance)) .addService(new FetchService(instance)) .addService(ProtoReflectionService.newInstance()) .addService(new PublishBuildEventService()) @@ -141,9 +168,7 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName log.info(String.format("%s initialized", configs.getServer().getSession())); - checkState(!stopping, "must not call start after stop"); instance.start(publicName); - WebController.setInstance((ShardInstance) instance); server.start(); healthStatusManager.setStatus( @@ -152,38 +177,70 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName healthCheckMetric.labels("start").inc(); } - @PreDestroy - public void stop() { - synchronized (this) { - if (stopping) { - return; - } - stopping = true; + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); + } + } + + synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); } - System.err.println("*** shutting down gRPC server since JVM is shutting down"); + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); + prepareServerForGracefulShutdown(); healthStatusManager.setStatus( HealthStatusManager.SERVICE_NAME_ALL_SERVICES, ServingStatus.NOT_SERVING); PrometheusPublisher.stopHttpServer(); healthCheckMetric.labels("stop").inc(); try { - if (server != null) { - server.shutdown(); - } + initiateShutdown(); instance.stop(); - server.awaitTermination(10, TimeUnit.SECONDS); + if (server != null && server.awaitTermination(10, TimeUnit.SECONDS)) { + server = null; + } } catch (InterruptedException e) { if (server != null) { server.shutdownNow(); + server = null; } + throw e; } if (!shutdownAndAwaitTermination(keepaliveScheduler, 10, TimeUnit.SECONDS)) { log.warning("could not shut down keepalive scheduler"); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); + } + + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); } - @PostConstruct - public void init() throws OptionsParsingException { + private void initiateShutdown() { + shutdownInitiated.set(true); + if (server != null) { + server.shutdown(); + } + } + + private void awaitTermination() throws InterruptedException { + while (!shutdownInitiated.get()) { + if (server != null && server.awaitTermination(1, TimeUnit.SECONDS)) { + server = null; + shutdownInitiated.set(true); + } + } + } + + public static void main(String[] args) throws Exception { // Only log severe log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -191,37 +248,24 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); - try { - start( - ServerBuilder.forPort(configs.getServer().getPort()), - configs.getServer().getPublicName()); - } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); - } catch (InterruptedException e) { - System.err.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); - } - } - - public static void main(String[] args) throws ConfigurationException { configs = BuildfarmConfigs.loadServerConfigs(args); // Configure Spring - SpringApplication app = new SpringApplication(BuildFarmServer.class); - Map springConfig = new HashMap<>(); - - // Disable Logback - System.setProperty("org.springframework.boot.logging.LoggingSystem", "none"); - - springConfig.put("ui.frontend.enable", configs.getUi().isEnable()); - springConfig.put("server.port", configs.getUi().getPort()); - app.setDefaultProperties(springConfig); + BuildFarmServer server = new BuildFarmServer(); try { - app.run(args); - } catch (Throwable t) { - log.log(SEVERE, "Error running application", t); + server.start( + ServerBuilder.forPort(configs.getServer().getPort()), + configs.getServer().getPublicName()); + server.awaitTermination(); + } catch (IOException e) { + log.severe("error: " + formatIOError(e)); + } catch (InterruptedException e) { + log.log(WARNING, "interrupted", e); + } catch (Exception e) { + log.log(SEVERE, "Error running application", e); + } finally { + server.stop(); } } } diff --git a/src/main/java/build/buildfarm/server/controllers/BUILD b/src/main/java/build/buildfarm/server/controllers/BUILD deleted file mode 100644 index 1e523bcef5..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/BUILD +++ /dev/null @@ -1,46 +0,0 @@ -java_library( - name = "WebController", - srcs = ["WebController.java"], - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common", - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/common/grpc", - "//src/main/java/build/buildfarm/common/resources", - "//src/main/java/build/buildfarm/common/resources:resource_java_proto", - "//src/main/java/build/buildfarm/common/services", - "//src/main/java/build/buildfarm/instance", - "//src/main/java/build/buildfarm/instance/shard", - "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/operations", - "//src/main/java/build/buildfarm/server/services", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_error_details_proto", - "@maven//:com_github_pcj_google_options", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:com_googlecode_json_simple_json_simple", - "@maven//:com_jayway_jsonpath_json_path", - "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_protobuf", - "@maven//:io_grpc_grpc_services", - "@maven//:io_prometheus_simpleclient", - "@maven//:javax_annotation_javax_annotation_api", - "@maven//:org_bouncycastle_bcprov_jdk15on", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_web", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/server/controllers/WebController.java b/src/main/java/build/buildfarm/server/controllers/WebController.java deleted file mode 100644 index 5ed5c8250e..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/WebController.java +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.server.controllers; - -import build.bazel.remote.execution.v2.ExecuteOperationMetadata; -import build.bazel.remote.execution.v2.ExecuteResponse; -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.instance.shard.ShardInstance; -import build.buildfarm.operations.EnrichedOperation; -import build.buildfarm.v1test.CompletedOperationMetadata; -import build.buildfarm.v1test.ExecutingOperationMetadata; -import build.buildfarm.v1test.QueuedOperationMetadata; -import com.google.longrunning.Operation; -import com.google.protobuf.Any; -import com.google.protobuf.Duration; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.Timestamp; -import com.google.protobuf.util.Durations; -import com.google.protobuf.util.JsonFormat; -import com.google.protobuf.util.Timestamps; -import com.google.rpc.PreconditionFailure; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.stereotype.Controller; -import org.springframework.ui.Model; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; - -// An HTTP frontend for providing a web UI in buildfarm. Its primarily used by developers so that -// they can introspect their build inovocations. Standalone tools that communicate over GRPC are -// sometimes less accessible. Nonetheless, the APIs should be similar. -// The web frontend can be deployed as part of the servers. However, the controller may also be -// included in other standalone tools if you'd like the frontend decoupled from the servers. -// This controller can provide web content directly through spring boot via thymeleaf. -// The controller also provides raw REST APIs for those wishing to build out their own frontend. -@Log -@Controller -@ConditionalOnProperty("ui.frontend.enable") -public class WebController { - private static ShardInstance instance; - - // Most of the routes require an instance to have their queries fulfilled. - // I wanted to have the instance set in the controller's constructor, but I was having trouble - // doing that with springboot's bean initialization and auto-wiring. - // Particularly because the controller is constructed before the instance is actually created. - // I settled for having the instance static and allowing the server startup to provide the - // instance once ready. - public static void setInstance(ShardInstance instanceIn) { - instance = instanceIn; - } - - // This is typically the starting for a developer looking to introspect their builds. - // When running a bazel client with `--bes_results_url` they will shown this URL route. - // The page is intented to give them a summary of their build invocation and allow them to drill - // down into more details. - @GetMapping("/invocation/{invocationId}") - public String invocation(Model model, @PathVariable String invocationId) { - // We need to look up the user's operations as fast as possible. Scanning all of the stored - // operations and filtering by invocatio ID (i.e. O(n)) does not scale. Instead, the invocation - // ID must be the primary key to their specific list of build operation IDs. We then lookup - // each - // operation by ID (preferably batched). This is technically two backend calls. It could be - // made faster - // at the expense of duplicate information stored in the backend. - Set operationIDs = instance.findOperationsByInvocationId(invocationId); - Iterable> foundOperations = instance.getOperations(operationIDs); - - // Populate the model for the page. - buildInvocationModel(model, invocationId, foundOperations); - - // Render page. - return "invocation"; - } - - private void buildInvocationModel( - Model model, String invocationId, Iterable> foundOperations) { - // Create an array representing table information about all the oprations involved in the - // invocation. - // This is the core content of the page. - JSONArray operationResults = new JSONArray(); - for (Map.Entry entry : foundOperations) { - Operation operation = jsonToOperation(entry.getValue()); - JSONObject obj = new JSONObject(); - obj.put("target", extractTargetId(operation)); - obj.put("mnemonic", extractActionMnemonic(operation)); - obj.put("stage", extractStatus(operation)); - obj.put("duration", extractDuration(operation)); - obj.put("worker", extractWorker(operation)); - - String operationId = entry.getKey(); - String id = operationId.substring(operationId.lastIndexOf('/')).substring(1); - obj.put("operationId", id); - - operationResults.add(obj); - } - - // Populate data to be provided to the frontend. - model.addAttribute("operation", operationResults.toJSONString()); - model.addAttribute("invocationId", String.format("Invocation: %s", invocationId)); - } - - // An operation represents an executed action. - // The operation has a target ID which corresponds to the bazel label what developers are - // typically thinking of when wishing to evaluate their build. - // This page shows them all of the information that we track related to the operation. - @GetMapping("/operation/{operationId}") - public String operation(Model model, @PathVariable String operationId) { - EnrichedOperation result = - instance.findEnrichedOperation(String.format("shard/operations/%s", operationId)); - model.addAttribute("fullOperation", result.asJsonString()); - return "operation"; - } - - // Information about the current deployment. Useful for verifying what is running. - @GetMapping("/info") - public String info(Model model) { - return "info"; - } - - /** - * @brief Convert string json into operation type. - * @details Parses json and returns null if invalid. - * @param json The json to convert to Operation type. - * @return The created operation. - * @note Suggested return identifier: operation. - */ - private static Operation jsonToOperation(String json) { - // create a json parser - JsonFormat.Parser operationParser = - JsonFormat.parser() - .usingTypeRegistry( - JsonFormat.TypeRegistry.newBuilder() - .add(CompletedOperationMetadata.getDescriptor()) - .add(ExecutingOperationMetadata.getDescriptor()) - .add(ExecuteOperationMetadata.getDescriptor()) - .add(QueuedOperationMetadata.getDescriptor()) - .add(PreconditionFailure.getDescriptor()) - .build()) - .ignoringUnknownFields(); - - if (json == null) { - log.log(Level.WARNING, "Operation Json is empty"); - return null; - } - try { - Operation.Builder operationBuilder = Operation.newBuilder(); - operationParser.merge(json, operationBuilder); - return operationBuilder.build(); - } catch (InvalidProtocolBufferException e) { - log.log(Level.WARNING, "InvalidProtocolBufferException while building an operation.", e); - return null; - } - } - - private String extractTargetId(Operation operation) { - return expectRequestMetadata(operation).getTargetId(); - } - - private String extractActionMnemonic(Operation operation) { - return expectRequestMetadata(operation).getActionMnemonic(); - } - - private String extractStatus(Operation operation) { - return String.valueOf(expectExecuteOperationMetadata(operation).getStage()); - } - - private String extractDuration(Operation operation) { - Any result = operation.getResponse(); - try { - Timestamp start = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerStartTimestamp(); - Timestamp end = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerCompletedTimestamp(); - Duration duration = Timestamps.between(start, end); - return Durations.toSecondsAsDouble(duration) + "s"; - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private String extractWorker(Operation operation) { - Any result = operation.getResponse(); - try { - return result.unpack(ExecuteResponse.class).getResult().getExecutionMetadata().getWorker(); - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private static ExecuteOperationMetadata expectExecuteOperationMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getExecuteOperationMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); - if (executingOperationMetadata != null) { - return executingOperationMetadata.getExecuteOperationMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getExecuteOperationMetadata(); - } - return ExecuteOperationMetadata.getDefaultInstance(); - } - - private static RequestMetadata expectRequestMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getRequestMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); - if (executingOperationMetadata != null) { - return executingOperationMetadata.getRequestMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getRequestMetadata(); - } - return RequestMetadata.getDefaultInstance(); - } - - private static QueuedOperationMetadata maybeQueuedOperationMetadata(String name, Any metadata) { - if (metadata.is(QueuedOperationMetadata.class)) { - try { - return metadata.unpack(QueuedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static ExecutingOperationMetadata maybeExecutingOperationMetadata( - String name, Any metadata) { - if (metadata.is(ExecutingOperationMetadata.class)) { - try { - return metadata.unpack(ExecutingOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static CompletedOperationMetadata maybeCompletedOperationMetadata( - String name, Any metadata) { - if (metadata.is(CompletedOperationMetadata.class)) { - try { - return metadata.unpack(CompletedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid completed operation metadata %s", name), e); - } - } - return null; - } -} diff --git a/src/main/java/build/buildfarm/server/services/AdminService.java b/src/main/java/build/buildfarm/server/services/AdminService.java deleted file mode 100644 index 94178fbf27..0000000000 --- a/src/main/java/build/buildfarm/server/services/AdminService.java +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.server.services; - -import static build.buildfarm.common.grpc.Channels.createChannel; - -import build.buildfarm.admin.Admin; -import build.buildfarm.admin.aws.AwsAdmin; -import build.buildfarm.admin.gcp.GcpAdmin; -import build.buildfarm.common.CasIndexResults; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.instance.Instance; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.DisableScaleInProtectionRequestResults; -import build.buildfarm.v1test.GetClientStartTimeRequest; -import build.buildfarm.v1test.GetClientStartTimeResult; -import build.buildfarm.v1test.GetHostsRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; -import build.buildfarm.v1test.ReindexCasRequest; -import build.buildfarm.v1test.ReindexCasRequestResults; -import build.buildfarm.v1test.ScaleClusterRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequestResults; -import build.buildfarm.v1test.ShutDownWorkerGrpc; -import build.buildfarm.v1test.StopContainerRequest; -import build.buildfarm.v1test.TerminateHostRequest; -import com.google.rpc.Code; -import com.google.rpc.Status; -import io.grpc.ManagedChannel; -import io.grpc.stub.StreamObserver; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AdminService extends AdminGrpc.AdminImplBase { - private final Admin adminController; - private final Instance instance; - - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public AdminService(Instance instance) { - this.adminController = getAdminController(); - this.instance = instance; - } - - @Override - public void terminateHost(TerminateHostRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.terminateHost(request.getHostId()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not terminate host.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void stopContainer(StopContainerRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.stopContainer(request.getHostId(), request.getContainerName()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not stop container.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getHosts(GetHostsRequest request, StreamObserver responseObserver) { - try { - GetHostsResult result = null; - if (adminController != null) { - result = - adminController.getHosts( - request.getFilter(), request.getAgeInMinutes(), request.getStatus()); - } - responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not get hosts.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getClientStartTime( - GetClientStartTimeRequest request, - StreamObserver responseObserver) { - try { - GetClientStartTimeResult result = instance.getClientStartTime(request); - responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log( - Level.SEVERE, - String.format("Could not get client start time for %s.", request.getInstanceName()), - e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void scaleCluster(ScaleClusterRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.scaleCluster( - request.getScaleGroupName(), - request.getMinHosts(), - request.getMaxHosts(), - request.getTargetHosts(), - request.getTargetReservedHostsPercent()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not scale cluster.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void reindexCas( - ReindexCasRequest request, StreamObserver responseObserver) { - try { - CasIndexResults results = instance.reindexCas(); - log.info(String.format("CAS Indexer Results: %s", results.toMessage())); - responseObserver.onNext( - ReindexCasRequestResults.newBuilder() - .setRemovedHosts(results.removedHosts) - .setRemovedKeys(results.removedKeys) - .setTotalKeys(results.totalKeys) - .build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not reindex CAS.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - /** - * Server-side implementation of ShutDownWorkerGracefully. This will reroute the request to target - * worker. - * - * @param request ShutDownWorkerGracefullyRequest received through grpc - * @param responseObserver grpc response observer - */ - @Override - public void shutDownWorkerGracefully( - ShutDownWorkerGracefullyRequest request, - StreamObserver responseObserver) { - try { - informWorkerToPrepareForShutdown(request.getWorkerName()); - responseObserver.onNext(ShutDownWorkerGracefullyRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (Exception e) { - String errorMessage = - String.format( - "Could not inform the worker %s to prepare for graceful shutdown with error %s.", - request.getWorkerName(), e.getMessage()); - log.log(Level.SEVERE, errorMessage); - responseObserver.onError(new Exception(errorMessage)); - } - } - - /** - * Inform a worker to prepare for graceful shutdown. - * - * @param host the host that should be prepared for shutdown. - */ - @SuppressWarnings("ResultOfMethodCallIgnored") - private void informWorkerToPrepareForShutdown(String host) { - ManagedChannel channel = null; - try { - channel = createChannel(host); - ShutDownWorkerGrpc.ShutDownWorkerBlockingStub shutDownWorkerBlockingStub = - ShutDownWorkerGrpc.newBlockingStub(channel); - shutDownWorkerBlockingStub.prepareWorkerForGracefulShutdown( - PrepareWorkerForGracefulShutDownRequest.newBuilder().build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - /** - * Server-side implementation of disableScaleInProtection. - * - * @param request grpc request - * @param responseObserver grpc response observer - */ - @Override - public void disableScaleInProtection( - DisableScaleInProtectionRequest request, - StreamObserver responseObserver) { - try { - String hostPrivateIp = trimHostPrivateDns(request.getInstanceName()); - adminController.disableHostScaleInProtection(hostPrivateIp); - responseObserver.onNext(DisableScaleInProtectionRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (RuntimeException e) { - responseObserver.onError(e); - } - } - - /** - * The private dns get from worker might be suffixed with ":portNumber", which should be trimmed. - * - * @param hostPrivateIp the private dns should be trimmed. - * @return - */ - @SuppressWarnings("JavaDoc") - private String trimHostPrivateDns(String hostPrivateIp) { - String portSeparator = ":"; - if (hostPrivateIp.contains(portSeparator)) { - hostPrivateIp = hostPrivateIp.split(portSeparator)[0]; - } - return hostPrivateIp; - } - - private static Admin getAdminController() { - if (configs.getServer().getAdmin().getDeploymentEnvironment() == null) { - return null; - } - switch (configs.getServer().getAdmin().getDeploymentEnvironment()) { - default: - return null; - case AWS: - return new AwsAdmin(); - case GCP: - return new GcpAdmin(); - } - } -} diff --git a/src/main/java/build/buildfarm/server/services/BUILD b/src/main/java/build/buildfarm/server/services/BUILD index 54b7af993c..6bb44b94f8 100644 --- a/src/main/java/build/buildfarm/server/services/BUILD +++ b/src/main/java/build/buildfarm/server/services/BUILD @@ -4,17 +4,12 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", - "//src/main/java/build/buildfarm/admin/gcp", "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/common/resources:resource_java_proto", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/metrics", - "//src/main/java/build/buildfarm/metrics/aws", - "//src/main/java/build/buildfarm/metrics/gcp", "//src/main/java/build/buildfarm/metrics/log", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", diff --git a/src/main/java/build/buildfarm/server/services/ExecutionService.java b/src/main/java/build/buildfarm/server/services/ExecutionService.java index 7f46c0bb3c..69799524fb 100644 --- a/src/main/java/build/buildfarm/server/services/ExecutionService.java +++ b/src/main/java/build/buildfarm/server/services/ExecutionService.java @@ -28,8 +28,6 @@ import build.buildfarm.common.grpc.TracingMetadataUtils; import build.buildfarm.instance.Instance; import build.buildfarm.metrics.MetricsPublisher; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; -import build.buildfarm.metrics.gcp.GcpMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.ListenableFuture; @@ -205,13 +203,6 @@ public void execute(ExecuteRequest request, StreamObserver responseOb } private static MetricsPublisher getMetricsPublisher() { - switch (configs.getServer().getMetrics().getPublisher()) { - default: - return new LogMetricsPublisher(); - case AWS: - return new AwsMetricsPublisher(); - case GCP: - return new GcpMetricsPublisher(); - } + return new LogMetricsPublisher(); } } diff --git a/src/main/java/build/buildfarm/server/services/OperationQueueService.java b/src/main/java/build/buildfarm/server/services/OperationQueueService.java index 6d3edb8178..11f6501667 100644 --- a/src/main/java/build/buildfarm/server/services/OperationQueueService.java +++ b/src/main/java/build/buildfarm/server/services/OperationQueueService.java @@ -28,9 +28,7 @@ import com.google.rpc.Code; import io.grpc.Status; import io.grpc.StatusRuntimeException; -import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; -import java.util.function.Consumer; public class OperationQueueService extends OperationQueueGrpc.OperationQueueImplBase { private final Instance instance; @@ -43,14 +41,11 @@ private static class OperationQueueMatchListener implements MatchListener { @SuppressWarnings("rawtypes") private final InterruptingPredicate onMatch; - private final Consumer setOnCancelHandler; private static final QueueEntry queueEntry = null; @SuppressWarnings("rawtypes") - OperationQueueMatchListener( - InterruptingPredicate onMatch, Consumer setOnCancelHandler) { + OperationQueueMatchListener(InterruptingPredicate onMatch) { this.onMatch = onMatch; - this.setOnCancelHandler = setOnCancelHandler; } @Override @@ -70,11 +65,6 @@ public void onError(Throwable t) { Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } - - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - setOnCancelHandler.accept(onCancelHandler); - } } private InterruptingPredicate createOnMatch( @@ -97,14 +87,10 @@ private InterruptingPredicate createOnMatch( @Override public void take(TakeOperationRequest request, StreamObserver responseObserver) { - ServerCallStreamObserver callObserver = - (ServerCallStreamObserver) responseObserver; - try { instance.match( request.getPlatform(), - new OperationQueueMatchListener( - createOnMatch(instance, responseObserver), callObserver::setOnCancelHandler)); + new OperationQueueMatchListener(createOnMatch(instance, responseObserver))); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } diff --git a/src/main/java/build/buildfarm/tools/BUILD b/src/main/java/build/buildfarm/tools/BUILD index 37c6bdb58a..44a4498ef9 100644 --- a/src/main/java/build/buildfarm/tools/BUILD +++ b/src/main/java/build/buildfarm/tools/BUILD @@ -185,9 +185,9 @@ java_binary( ) java_binary( - name = "GracefulShutdownTest", - srcs = ["GracefulShutdownTest.java"], - main_class = "build.buildfarm.tools.GracefulShutdownTest", + name = "GracefulShutdown", + srcs = ["GracefulShutdown.java"], + main_class = "build.buildfarm.tools.GracefulShutdown", visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common/grpc", diff --git a/src/main/java/build/buildfarm/tools/Cat.java b/src/main/java/build/buildfarm/tools/Cat.java index 7c8e561454..bca404b3c1 100644 --- a/src/main/java/build/buildfarm/tools/Cat.java +++ b/src/main/java/build/buildfarm/tools/Cat.java @@ -650,6 +650,9 @@ private static void getWorkerProfile(Instance instance) { private static void printStageInformation(StageInformation stage) { System.out.printf("%s slots configured: %d%n", stage.getName(), stage.getSlotsConfigured()); System.out.printf("%s slots used %d%n", stage.getName(), stage.getSlotsUsed()); + for (String operationName : stage.getOperationNamesList()) { + System.out.printf("%s operation %s\n", stage.getName(), operationName); + } } private static void printOperationTime(OperationTimesBetweenStages time) { diff --git a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java b/src/main/java/build/buildfarm/tools/GracefulShutdown.java similarity index 93% rename from src/main/java/build/buildfarm/tools/GracefulShutdownTest.java rename to src/main/java/build/buildfarm/tools/GracefulShutdown.java index 85ae5ab78d..689edfeff1 100644 --- a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java +++ b/src/main/java/build/buildfarm/tools/GracefulShutdown.java @@ -23,9 +23,9 @@ import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.ManagedChannel; -class GracefulShutdownTest { +class GracefulShutdown { /** - * Example command: GracefulShutdownTest ShutDown workerIp buildfarm-endpoint + * Example command: GracefulShutdown ShutDown workerIp buildfarm-endpoint * * @param args */ @@ -48,7 +48,7 @@ private static void shutDownGracefully(String[] args) { } /** - * Example command: GracefulShutdownTest PrepareWorker WorkerIp:port + * Example command: GracefulShutdown PrepareWorker WorkerIp:port * * @param args */ @@ -65,7 +65,7 @@ private static void prepareWorkerForShutDown(String[] args) { } /** - * Example command: GracefulShutdownTest DisableProtection WorkerIp buildfarm_endpoint + * Example command: GracefulShutdown DisableProtection WorkerIp buildfarm_endpoint * * @param args */ diff --git a/src/main/java/build/buildfarm/worker/BUILD b/src/main/java/build/buildfarm/worker/BUILD index c25b9e74d6..e24b50d731 100644 --- a/src/main/java/build/buildfarm/worker/BUILD +++ b/src/main/java/build/buildfarm/worker/BUILD @@ -36,6 +36,7 @@ java_library( "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", "@maven//:org_apache_commons_commons_compress", + "@maven//:org_apache_commons_commons_lang3", "@maven//:org_jetbrains_annotations", "@maven//:org_projectlombok_lombok", ], diff --git a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java index c3c4c1dfc9..3536ed6bac 100644 --- a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java +++ b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java @@ -14,11 +14,12 @@ package build.buildfarm.worker; -import build.bazel.remote.execution.v2.Command; import build.bazel.remote.execution.v2.Platform; import build.buildfarm.common.ExecutionProperties; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.QueueEntry; +import build.buildfarm.worker.resources.LocalResourceSet; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import com.google.common.collect.Iterables; import com.google.common.collect.SetMultimap; import org.jetbrains.annotations.NotNull; @@ -45,6 +46,7 @@ public class DequeueMatchEvaluator { * @brief Decide whether the worker should keep the operation or put it back on the queue. * @details Compares the platform properties of the worker to the operation's platform properties. * @param workerProvisions The provisions of the worker. + * @param resourceSet The limited resources that the worker has available. * @param queueEntry An entry recently removed from the queue. * @return Whether or not the worker should accept or reject the queue entry. * @note Overloaded. @@ -53,24 +55,10 @@ public class DequeueMatchEvaluator { @SuppressWarnings("NullableProblems") @NotNull public static boolean shouldKeepOperation( - SetMultimap workerProvisions, QueueEntry queueEntry) { - return shouldKeepViaPlatform(workerProvisions, queueEntry.getPlatform()); - } - - /** - * @brief Decide whether the worker should keep the operation or put it back on the queue. - * @details Compares the platform properties of the worker to the operation's platform properties. - * @param workerProvisions The provisions of the worker. - * @param command A command to evaluate. - * @return Whether or not the worker should accept or reject the queue entry. - * @note Overloaded. - * @note Suggested return identifier: shouldKeepOperation. - */ - @SuppressWarnings("NullableProblems") - @NotNull - public static boolean shouldKeepOperation( - SetMultimap workerProvisions, Command command) { - return shouldKeepViaPlatform(workerProvisions, command.getPlatform()); + SetMultimap workerProvisions, + LocalResourceSet resourceSet, + QueueEntry queueEntry) { + return shouldKeepViaPlatform(workerProvisions, resourceSet, queueEntry.getPlatform()); } /** @@ -79,6 +67,7 @@ public static boolean shouldKeepOperation( * @details Compares the platform properties of the worker to the platform properties of the * operation. * @param workerProvisions The provisions of the worker. + * @param resourceSet The limited resources that the worker has available. * @param platform The platforms of operation. * @return Whether or not the worker should accept or reject the operation. * @note Suggested return identifier: shouldKeepOperation. @@ -86,9 +75,15 @@ public static boolean shouldKeepOperation( @SuppressWarnings("NullableProblems") @NotNull private static boolean shouldKeepViaPlatform( - SetMultimap workerProvisions, Platform platform) { - // attempt to execute everything the worker gets off the queue. + SetMultimap workerProvisions, + LocalResourceSet resourceSet, + Platform platform) { + // attempt to execute everything the worker gets off the queue, + // provided there is enough resources to do so. // this is a recommended configuration. + if (!LocalResourceSetUtils.claimResources(platform, resourceSet)) { + return false; + } if (configs.getWorker().getDequeueMatchSettings().isAcceptEverything()) { return true; } diff --git a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java index 6c5e247a8f..f1ad546676 100644 --- a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java +++ b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java @@ -107,7 +107,7 @@ public void releaseExecutor( int slotUsage = removeAndRelease(operationName, claims); executionTime.observe(usecs / 1000.0); executionStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, @@ -141,7 +141,7 @@ protected void iterate() throws InterruptedException { executors.add(executorThread); int slotUsage = executorClaims.addAndGet(limits.cpu.claimed); executionSlotUsage.set(slotUsage); - logStart(operationContext.operation.getName(), getUsage(slotUsage)); + start(operationContext.operation.getName(), getUsage(slotUsage)); executorThread.start(); } } diff --git a/src/main/java/build/buildfarm/worker/Executor.java b/src/main/java/build/buildfarm/worker/Executor.java index 0416c2354d..f4776a6f00 100644 --- a/src/main/java/build/buildfarm/worker/Executor.java +++ b/src/main/java/build/buildfarm/worker/Executor.java @@ -359,6 +359,9 @@ public void run(ResourceLimits limits) { } finally { boolean wasInterrupted = Thread.interrupted(); try { + // Now that the execution has finished we can return any of the claims against local + // resources. + workerContext.returnLocalResources(operationContext.queueEntry); owner.releaseExecutor( operationName, limits.cpu.claimed, diff --git a/src/main/java/build/buildfarm/worker/InputFetchStage.java b/src/main/java/build/buildfarm/worker/InputFetchStage.java index 6b0f741fd8..3953ef9595 100644 --- a/src/main/java/build/buildfarm/worker/InputFetchStage.java +++ b/src/main/java/build/buildfarm/worker/InputFetchStage.java @@ -72,13 +72,14 @@ public void releaseInputFetcher( int size = removeAndRelease(operationName); inputFetchTime.observe(usecs / 1000.0); inputFetchStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, String.format("%s, %s", success ? "Success" : "Failure", getUsage(size))); } + @Override public int getSlotUsage() { return fetchers.size(); } @@ -106,8 +107,7 @@ protected void iterate() throws InterruptedException { fetchers.add(fetcher); int slotUsage = fetchers.size(); inputFetchSlotUsage.set(slotUsage); - logStart( - operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); + start(operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); fetcher.start(); } } diff --git a/src/main/java/build/buildfarm/worker/MatchStage.java b/src/main/java/build/buildfarm/worker/MatchStage.java index e245ee68d6..663aee3c52 100644 --- a/src/main/java/build/buildfarm/worker/MatchStage.java +++ b/src/main/java/build/buildfarm/worker/MatchStage.java @@ -95,20 +95,15 @@ public void onError(Throwable t) { throw new RuntimeException(t); } - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - // never called, only blocking stub used - } - @SuppressWarnings("SameReturnValue") private boolean onOperationPolled() throws InterruptedException { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - logStart(operationName); + start(operationName); long matchingAtUSecs = stopwatch.elapsed(MICROSECONDS); OperationContext matchedOperationContext = match(operationContext); long matchedInUSecs = stopwatch.elapsed(MICROSECONDS) - matchingAtUSecs; - logComplete(operationName, matchedInUSecs, waitDuration, true); + complete(operationName, matchedInUSecs, waitDuration, true); matchedOperationContext.poller.pause(); try { output.put(matchedOperationContext); @@ -139,7 +134,7 @@ protected void iterate() throws InterruptedException { } MatchOperationListener listener = new MatchOperationListener(operationContext, stopwatch); try { - logStart(); + start(); workerContext.match(listener); } finally { if (!listener.wasMatched()) { diff --git a/src/main/java/build/buildfarm/worker/OutputDirectory.java b/src/main/java/build/buildfarm/worker/OutputDirectory.java index 6245553803..caf77b273e 100644 --- a/src/main/java/build/buildfarm/worker/OutputDirectory.java +++ b/src/main/java/build/buildfarm/worker/OutputDirectory.java @@ -161,10 +161,15 @@ private static OutputDirectory parseDirectories(Iterable o Iterables.addAll(sortedOutputDirs, outputDirs); Collections.sort(sortedOutputDirs); + String currentOutputDir = ""; Builder currentBuilder = builder; String prefix = "/"; for (OutputDirectoryEntry entry : sortedOutputDirs) { String outputDir = entry.outputDirectory; + if (outputDir == currentOutputDir) { + continue; + } + currentOutputDir = outputDir; while (!outputDir.startsWith(prefix)) { currentBuilder = stack.pop(); int upPathSeparatorIndex = prefix.lastIndexOf('/', prefix.length() - 2); diff --git a/src/main/java/build/buildfarm/worker/Pipeline.java b/src/main/java/build/buildfarm/worker/Pipeline.java index 4198b537e6..0228f883ac 100644 --- a/src/main/java/build/buildfarm/worker/Pipeline.java +++ b/src/main/java/build/buildfarm/worker/Pipeline.java @@ -44,14 +44,7 @@ public void add(PipelineStage stage, int closePriority) { stageClosePriorities.put(stage, closePriority); } - /** - * Start the pipeline. - * - *

You can provide callback which is invoked when any stage has an uncaught exception, for - * instance to shutdown the worker gracefully - */ - public void start(SettableFuture uncaughtExceptionFuture) { - stageThreadGroup.setUncaughtExceptionFuture(uncaughtExceptionFuture); + public void start() { for (Thread stageThread : stageThreads.values()) { stageThread.start(); } @@ -70,9 +63,12 @@ public void close() throws InterruptedException { /** Inform MatchStage to stop matching or picking up new Operations from queue. */ public void stopMatchingOperations() { - for (PipelineStage stage : stageClosePriorities.keySet()) { + for (Map.Entry entry : stageThreads.entrySet()) { + PipelineStage stage = entry.getKey(); if (stage instanceof MatchStage) { - ((MatchStage) stage).prepareForGracefulShutdown(); + MatchStage matchStage = (MatchStage) stage; + matchStage.prepareForGracefulShutdown(); + entry.getValue().interrupt(); return; } } @@ -143,15 +139,17 @@ private void join(boolean closeStage) throws InterruptedException { } } if (stageToClose != null && !stageToClose.isClosed()) { - log.log(Level.FINER, "Closing stage at priority " + maxPriority); + log.log(Level.FINER, "Closing stage " + stageToClose + " at priority " + maxPriority); stageToClose.close(); } } + boolean longStageWait = !closeStage; for (Map.Entry stageThread : stageThreads.entrySet()) { PipelineStage stage = stageThread.getKey(); Thread thread = stageThread.getValue(); try { - thread.join(closeStage ? 1 : 1000); + // 0 is wait forever, no instant wait + thread.join(longStageWait ? 1000 : 1); } catch (InterruptedException e) { if (!closeStage) { synchronized (this) { @@ -181,8 +179,8 @@ private void join(boolean closeStage) throws InterruptedException { + stageClosePriorities.get(stage)); thread.interrupt(); } + longStageWait = false; } - closeStage = false; for (PipelineStage stage : inactiveStages) { synchronized (this) { stageThreads.remove(stage); diff --git a/src/main/java/build/buildfarm/worker/PipelineStage.java b/src/main/java/build/buildfarm/worker/PipelineStage.java index f40de3c54c..ecc78f8e7e 100644 --- a/src/main/java/build/buildfarm/worker/PipelineStage.java +++ b/src/main/java/build/buildfarm/worker/PipelineStage.java @@ -14,11 +14,13 @@ package build.buildfarm.worker; +import static java.lang.String.format; import static java.util.concurrent.TimeUnit.MICROSECONDS; import com.google.common.base.Stopwatch; import java.util.logging.Level; import java.util.logging.Logger; +import javax.annotation.Nullable; public abstract class PipelineStage implements Runnable { protected final String name; @@ -30,6 +32,7 @@ public abstract class PipelineStage implements Runnable { private volatile boolean closed = false; private Thread tickThread = null; private boolean tickCancelledFlag = false; + private String operationName = null; PipelineStage( String name, WorkerContext workerContext, PipelineStage output, PipelineStage error) { @@ -39,18 +42,29 @@ public abstract class PipelineStage implements Runnable { this.error = error; } + public String getName() { + return name; + } + private void runInterruptible() throws InterruptedException { while (!output.isClosed() || isClaimed()) { iterate(); } } + public @Nullable String getOperationName() { + return operationName; + } + @Override public void run() { try { runInterruptible(); } catch (InterruptedException e) { // ignore + } catch (Exception e) { + getLogger() + .log(Level.SEVERE, format("%s::run(): stage terminated due to exception", name), e); } finally { boolean wasInterrupted = Thread.interrupted(); try { @@ -90,7 +104,7 @@ protected void iterate() throws InterruptedException { Stopwatch stopwatch = Stopwatch.createUnstarted(); try { operationContext = take(); - logStart(operationContext.operation.getName()); + start(operationContext.operation.getName()); stopwatch.start(); boolean valid = false; tickThread = Thread.currentThread(); @@ -124,35 +138,38 @@ protected void iterate() throws InterruptedException { } after(operationContext); long usecs = stopwatch.elapsed(MICROSECONDS); - logComplete( - operationContext.operation.getName(), usecs, stallUSecs, nextOperationContext != null); + complete(operationName, usecs, stallUSecs, nextOperationContext != null); + operationName = null; } private String logIterateId(String operationName) { - return String.format("%s::iterate(%s)", name, operationName); + return format("%s::iterate(%s)", name, operationName); } - protected void logStart() { - logStart(""); + protected void start() { + start(""); } - protected void logStart(String operationName) { - logStart(operationName, "Starting"); + protected void start(String operationName) { + start(operationName, "Starting"); } - protected void logStart(String operationName, String message) { - getLogger().log(Level.FINER, String.format("%s: %s", logIterateId(operationName), message)); + protected void start(String operationName, String message) { + // TODO to unary stage + this.operationName = operationName; + getLogger().log(Level.FINER, format("%s: %s", logIterateId(operationName), message)); } - protected void logComplete(String operationName, long usecs, long stallUSecs, boolean success) { - logComplete(operationName, usecs, stallUSecs, success ? "Success" : "Failed"); + protected void complete(String operationName, long usecs, long stallUSecs, boolean success) { + complete(operationName, usecs, stallUSecs, success ? "Success" : "Failed"); } - protected void logComplete(String operationName, long usecs, long stallUSecs, String status) { + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + this.operationName = operationName; getLogger() .log( Level.FINER, - String.format( + format( "%s: %g ms (%g ms stalled) %s", logIterateId(operationName), usecs / 1000.0f, stallUSecs / 1000.0f, status)); } diff --git a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java index 64636d72c2..ed7e610a10 100644 --- a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java +++ b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java @@ -14,17 +14,21 @@ package build.buildfarm.worker; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; import java.util.logging.Level; -abstract class SuperscalarPipelineStage extends PipelineStage { +public abstract class SuperscalarPipelineStage extends PipelineStage { protected final int width; @SuppressWarnings("rawtypes") protected final BlockingQueue claims; + protected Set operationNames = new HashSet<>(); + private volatile boolean catastrophic = false; // ensure that only a single claim waits for available slots for core count @@ -46,6 +50,39 @@ public SuperscalarPipelineStage( protected abstract int claimsRequired(OperationContext operationContext); + @Override + public String getOperationName() { + throw new UnsupportedOperationException("use getOperationNames on superscalar stages"); + } + + public int getWidth() { + return width; + } + + public abstract int getSlotUsage(); + + public Iterable getOperationNames() { + synchronized (operationNames) { + return new HashSet(operationNames); + } + } + + @Override + protected void start(String operationName, String message) { + synchronized (operationNames) { + operationNames.add(operationName); + } + super.start(operationName, message); + } + + @Override + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + super.complete(operationName, usecs, stallUSecs, status); + synchronized (operationNames) { + operationNames.remove(operationName); + } + } + synchronized void waitForReleaseOrCatastrophe(BlockingQueue queue) { boolean interrupted = false; while (!catastrophic && isClaimed()) { diff --git a/src/main/java/build/buildfarm/worker/WorkerContext.java b/src/main/java/build/buildfarm/worker/WorkerContext.java index 873ad1b938..70060acea1 100644 --- a/src/main/java/build/buildfarm/worker/WorkerContext.java +++ b/src/main/java/build/buildfarm/worker/WorkerContext.java @@ -130,4 +130,6 @@ IOResource limitExecution( int commandExecutionClaims(Command command); ResourceLimits commandExecutionSettings(Command command); + + void returnLocalResources(QueueEntry queueEntry); } diff --git a/src/main/java/build/buildfarm/worker/resources/BUILD b/src/main/java/build/buildfarm/worker/resources/BUILD index ac2d69179b..36a4dc3d34 100644 --- a/src/main/java/build/buildfarm/worker/resources/BUILD +++ b/src/main/java/build/buildfarm/worker/resources/BUILD @@ -8,6 +8,7 @@ java_library( "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@maven//:com_google_guava_guava", "@maven//:com_googlecode_json_simple_json_simple", + "@maven//:io_prometheus_simpleclient", "@maven//:org_apache_commons_commons_lang3", ], ) diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java new file mode 100644 index 0000000000..97d64fe9b1 --- /dev/null +++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java @@ -0,0 +1,37 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.worker.resources; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Semaphore; + +/** + * @class Local Resource Set + * @brief A fixed amount of a specific resource. + * @details We define limited resources as a counting semaphores whose configuration contains a name + * and a count representing a physical or logical group of units obtained by executors as a + * precondition to fulfill a long running operation. These units are released upon the + * operation's completion. The resource is requested by the action's platform properties. These + * resources are specific to the individual worker. + */ +public class LocalResourceSet { + /** + * @field resources + * @brief A set containing resource semaphores organized by name. + * @details Key is name, and value is current usage amount. + */ + public Map resources = new HashMap<>(); +} diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java new file mode 100644 index 0000000000..787a7840e5 --- /dev/null +++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java @@ -0,0 +1,46 @@ +// Copyright 2020 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.worker.resources; + +import io.prometheus.client.Gauge; + +/** + * @class LocalResourceSetMetrics + * @brief Tracks metrics related to a worker's limited local resources. + * @details Answers how many resources exist, how many are claimed, and by how many requesters. + */ +public class LocalResourceSetMetrics { + public static final Gauge resourceUsageMetric = + Gauge.build() + .name("local_resource_usage") + .labelNames("resource_name") + .help("The number of claims for each resource currently being used for execution") + .register(); + + public static final Gauge resourceTotalMetric = + Gauge.build() + .name("local_resource_total") + .labelNames("resource_name") + .help("The total number of claims exist for a particular resource") + .register(); + + public static final Gauge requestersMetric = + Gauge.build() + .name("local_resource_requesters") + .labelNames("resource_name") + .help( + "Tracks how many actions have requested local resources. This can help determine if resources are being hogged by some actions.") + .register(); +} diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java new file mode 100644 index 0000000000..e8a1e56131 --- /dev/null +++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java @@ -0,0 +1,120 @@ +// Copyright 2020 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.worker.resources; + +import build.bazel.remote.execution.v2.Platform; +import build.buildfarm.common.config.LimitedResource; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Semaphore; +import org.apache.commons.lang3.StringUtils; + +/** + * @class LocalResourceSetUtils + * @brief Utilities for working with the worker's set of local limited resources. + * @details The methods help with allocation / de-allocation of claims, as well as metrics printing. + */ +public class LocalResourceSetUtils { + private static final LocalResourceSetMetrics metrics = new LocalResourceSetMetrics(); + + public static LocalResourceSet create(List resources) { + LocalResourceSet resourceSet = new LocalResourceSet(); + for (LimitedResource resource : resources) { + resourceSet.resources.put(resource.getName(), new Semaphore(resource.getAmount())); + metrics.resourceTotalMetric.labels(resource.getName()).set(resource.getAmount()); + } + return resourceSet; + } + + public static boolean claimResources(Platform platform, LocalResourceSet resourceSet) { + List> claimed = new ArrayList<>(); + + boolean allClaimed = true; + for (Platform.Property property : platform.getPropertiesList()) { + // Skip properties that are not requesting a limited resource. + String resourceName = getResourceName(property); + Semaphore resource = resourceSet.resources.get(resourceName); + if (resource == null) { + continue; + } + + // Attempt to claim. If claiming fails, we must return all other claims. + int requestAmount = getResourceRequestAmount(property); + boolean wasAcquired = semaphoreAquire(resource, resourceName, requestAmount); + if (wasAcquired) { + claimed.add(new AbstractMap.SimpleEntry<>(resourceName, requestAmount)); + } else { + allClaimed = false; + break; + } + } + + // cleanup remaining resources if they were not all claimed. + if (!allClaimed) { + for (Map.Entry claim : claimed) { + semaphoreRelease( + resourceSet.resources.get(claim.getKey()), claim.getKey(), claim.getValue()); + } + } + + return allClaimed; + } + + public static void releaseClaims(Platform platform, LocalResourceSet resourceSet) { + for (Platform.Property property : platform.getPropertiesList()) { + String resourceName = getResourceName(property); + Semaphore resource = resourceSet.resources.get(resourceName); + if (resource == null) { + continue; + } + int requestAmount = getResourceRequestAmount(property); + semaphoreRelease(resource, resourceName, requestAmount); + } + } + + private static boolean semaphoreAquire(Semaphore resource, String resourceName, int amount) { + boolean wasAcquired = resource.tryAcquire(amount); + if (wasAcquired) { + metrics.resourceUsageMetric.labels(resourceName).inc(amount); + } + metrics.requestersMetric.labels(resourceName).inc(); + return wasAcquired; + } + + private static void semaphoreRelease(Semaphore resource, String resourceName, int amount) { + resource.release(amount); + metrics.resourceUsageMetric.labels(resourceName).dec(amount); + metrics.requestersMetric.labels(resourceName).dec(); + } + + private static int getResourceRequestAmount(Platform.Property property) { + // We support resource values that are not numbers and interpret them as a request for 1 + // resource. For example "gpu:RTX-4090" is equivalent to resource:gpu:1". + try { + return Integer.parseInt(property.getValue()); + } catch (NumberFormatException e) { + return 1; + } + } + + private static String getResourceName(Platform.Property property) { + // We match to keys whether they are prefixed "resource:" or not. + // "resource:gpu:1" requests the gpu resource in the same way that "gpu:1" does. + // The prefix originates from bazel's syntax for the --extra_resources flag. + return StringUtils.removeStart(property.getName(), "resource:"); + } +} diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index beeaaae918..b62e4369de 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -4,8 +4,6 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", "//src/main/java/build/buildfarm/backplane", "//src/main/java/build/buildfarm/cas", "//src/main/java/build/buildfarm/common", @@ -23,6 +21,8 @@ java_library( "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@googleapis//:google_rpc_error_details_java_proto", + "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@maven//:com_github_ben_manes_caffeine_caffeine", "@maven//:com_github_pcj_google_options", "@maven//:com_google_code_findbugs_jsr305", @@ -38,10 +38,5 @@ java_library( "@maven//:io_prometheus_simpleclient", "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", ], ) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index 500dcb2f6d..264151ef48 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -19,7 +19,6 @@ import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.Iterables.concat; -import static com.google.common.collect.Iterables.filter; import static com.google.common.util.concurrent.Futures.allAsList; import static com.google.common.util.concurrent.Futures.catchingAsync; import static com.google.common.util.concurrent.Futures.immediateFailedFuture; @@ -41,13 +40,15 @@ import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.common.BuildfarmExecutors; +import build.buildfarm.common.DigestUtil; import build.buildfarm.common.io.Directories; import build.buildfarm.common.io.Dirent; import build.buildfarm.worker.ExecDirException; import build.buildfarm.worker.ExecDirException.ViolationException; import build.buildfarm.worker.OutputDirectory; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; import com.google.common.util.concurrent.ListenableFuture; import java.io.IOException; import java.io.InputStream; @@ -55,14 +56,17 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.UserPrincipal; -import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.Stack; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.function.Consumer; import java.util.logging.Level; +import java.util.regex.Pattern; import javax.annotation.Nullable; import lombok.extern.java.Log; @@ -75,8 +79,11 @@ class CFCExecFileSystem implements ExecFileSystem { // perform first-available non-output symlinking and retain directories in cache private final boolean linkInputDirectories; - // override the symlinking above for a set of matching paths - private final Iterable realInputDirectories; + // indicate symlinking above for a set of matching paths + private final Iterable linkedInputDirectories; + + // permit symlinks to point to absolute paths in inputs + private final boolean allowSymlinkTargetAbsolute; private final Map> rootKeys = new ConcurrentHashMap<>(); private final Map> rootInputFiles = new ConcurrentHashMap<>(); @@ -91,14 +98,18 @@ class CFCExecFileSystem implements ExecFileSystem { CASFileCache fileCache, @Nullable UserPrincipal owner, boolean linkInputDirectories, - Iterable realInputDirectories, + Iterable linkedInputDirectories, + boolean allowSymlinkTargetAbsolute, ExecutorService removeDirectoryService, ExecutorService accessRecorder) { this.root = root; this.fileCache = fileCache; this.owner = owner; this.linkInputDirectories = linkInputDirectories; - this.realInputDirectories = realInputDirectories; + this.linkedInputDirectories = + Iterables.transform( + linkedInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); + this.allowSymlinkTargetAbsolute = allowSymlinkTargetAbsolute; this.removeDirectoryService = removeDirectoryService; this.accessRecorder = accessRecorder; } @@ -174,7 +185,7 @@ public InputStream newInput(Compressor.Value compressor, Digest digest, long off private ListenableFuture putSymlink(Path path, SymlinkNode symlinkNode) { Path symlinkPath = path.resolve(symlinkNode.getName()); Path relativeTargetPath = path.getFileSystem().getPath(symlinkNode.getTarget()); - checkState(!relativeTargetPath.isAbsolute()); + checkState(allowSymlinkTargetAbsolute || !relativeTargetPath.isAbsolute()); return listeningDecorator(fetchService) .submit( () -> { @@ -240,6 +251,7 @@ private Iterable> fetchInputs( Digest directoryDigest, Map directoriesIndex, OutputDirectory outputDirectory, + Set linkedInputDirectories, Consumer onKey, ImmutableList.Builder inputDirectories) throws IOException { @@ -263,13 +275,16 @@ private Iterable> fetchInputs( .map(symlinkNode -> putSymlink(path, symlinkNode)) .collect(ImmutableList.toImmutableList())); + ImmutableList.Builder> linkedDirectories = ImmutableList.builder(); for (DirectoryNode directoryNode : directory.getDirectoriesList()) { Digest digest = directoryNode.getDigest(); String name = directoryNode.getName(); OutputDirectory childOutputDirectory = outputDirectory != null ? outputDirectory.getChild(name) : null; Path dirPath = path.resolve(name); - if (childOutputDirectory != null || !linkInputDirectories) { + if (childOutputDirectory != null + || !linkInputDirectories + || !linkedInputDirectories.contains(dirPath)) { Files.createDirectories(dirPath); downloads = concat( @@ -280,32 +295,27 @@ private Iterable> fetchInputs( digest, directoriesIndex, childOutputDirectory, + linkedInputDirectories, onKey, inputDirectories)); } else { - downloads = - concat( - downloads, - ImmutableList.of( - transform( - linkDirectory(dirPath, digest, directoriesIndex), - (result) -> { - // note: this could non-trivial make sync due to - // the way decrementReferences is implemented. - // we saw null entries in the built immutable list - // without synchronization - synchronized (inputDirectories) { - inputDirectories.add(digest); - } - return null; - }, - fetchService))); + linkedDirectories.add( + transform( + linkDirectory(dirPath, digest, directoriesIndex), + (result) -> { + // we saw null entries in the built immutable list without synchronization + synchronized (inputDirectories) { + inputDirectories.add(digest); + } + return null; + }, + fetchService)); } if (Thread.currentThread().isInterrupted()) { break; } } - return downloads; + return concat(downloads, linkedDirectories.build()); } @SuppressWarnings("ConstantConditions") @@ -313,8 +323,14 @@ private ListenableFuture linkDirectory( Path execPath, Digest digest, Map directoriesIndex) { return transformAsync( fileCache.putDirectory(digest, directoriesIndex, fetchService), - (cachePath) -> { - Files.createSymbolicLink(execPath, cachePath); + pathResult -> { + Path path = pathResult.getPath(); + if (pathResult.getMissed()) { + log.fine( + String.format( + "putDirectory(%s, %s) created", execPath, DigestUtil.toString(digest))); + } + Files.createSymbolicLink(execPath, path); return immediateFuture(null); }, fetchService); @@ -326,30 +342,76 @@ private static void checkExecErrors(Path path, List errors) throws Ex } } - private static boolean treeContainsPath( - String directoryPath, Map directoriesIndex, Digest rootDigest) { - Directory directory = directoriesIndex.get(rootDigest); - for (String name : directoryPath.split("/")) { - List subdirs = directory.getDirectoriesList(); - int index = Collections.binarySearch(Lists.transform(subdirs, DirectoryNode::getName), name); - if (index < 0) { - return false; + private static Iterator directoriesIterator( + Digest digest, Map directoriesIndex) { + Directory root = directoriesIndex.get(digest); + return new Iterator() { + boolean atEnd = root.getDirectoriesCount() == 0; + Stack path = new Stack<>(); + Stack> route = new Stack<>(); + Iterator current = root.getDirectoriesList().iterator(); + + @Override + public boolean hasNext() { + return !atEnd; } - directory = directoriesIndex.get(subdirs.get(index).getDigest()); - } - return true; + + @Override + public String next() { + String nextPath; + DirectoryNode next = current.next(); + String name = next.getName(); + path.push(name); + nextPath = String.join("/", path); + Digest digest = next.getDigest(); + if (digest.getSizeBytes() != 0) { + route.push(current); + current = directoriesIndex.get(digest).getDirectoriesList().iterator(); + } else { + path.pop(); + } + while (!current.hasNext() && !route.isEmpty()) { + current = route.pop(); + path.pop(); + } + atEnd = !current.hasNext(); + return nextPath; + } + }; } - private Iterable realDirectories( + private Set linkedDirectories( Map directoriesIndex, Digest rootDigest) { // skip this search if all the directories are real if (linkInputDirectories) { - // somewhat inefficient, but would need many overrides to be painful - return filter( - realInputDirectories, - realInputDirectory -> treeContainsPath(realInputDirectory, directoriesIndex, rootDigest)); + ImmutableSet.Builder builder = ImmutableSet.builder(); + + Iterator dirs = directoriesIterator(rootDigest, directoriesIndex); + while (dirs.hasNext()) { + String dir = dirs.next(); + for (Pattern pattern : linkedInputDirectories) { + if (pattern.matcher(dir).matches()) { + builder.add(dir); + break; // avoid adding the same directory twice + } + } + } + return builder.build(); } - return ImmutableList.of(); + return ImmutableSet.of(); + } + + private OutputDirectory createOutputDirectory(Command command) { + Iterable files; + Iterable dirs; + if (command.getOutputPathsCount() != 0) { + files = command.getOutputPathsList(); + dirs = ImmutableList.of(); // output paths require the action to create their own directory + } else { + files = command.getOutputFilesList(); + dirs = command.getOutputDirectoriesList(); + } + return OutputDirectory.parse(files, dirs, command.getEnvironmentVariablesList()); } @Override @@ -358,13 +420,7 @@ public Path createExecDir( throws IOException, InterruptedException { log.log(Level.FINEST, "ExecFileSystem::createExecDir(" + operationName + ")"); Digest inputRootDigest = action.getInputRootDigest(); - OutputDirectory outputDirectory = - OutputDirectory.parse( - command.getOutputFilesList(), - concat( - command.getOutputDirectoriesList(), - realDirectories(directoriesIndex, inputRootDigest)), - command.getEnvironmentVariablesList()); + OutputDirectory outputDirectory = createOutputDirectory(command); Path execDir = root.resolve(operationName); if (Files.exists(execDir)) { @@ -375,6 +431,12 @@ public Path createExecDir( ImmutableList.Builder inputFiles = new ImmutableList.Builder<>(); ImmutableList.Builder inputDirectories = new ImmutableList.Builder<>(); + Set linkedInputDirectories = + ImmutableSet.copyOf( + Iterables.transform( + linkedDirectories(directoriesIndex, inputRootDigest), + path -> execDir.resolve(path))); // does this work on windows with / separators? + log.log( Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") calling fetchInputs"); // Get lock keys so we can increment them prior to downloading @@ -390,6 +452,7 @@ public Path createExecDir( inputRootDigest, directoriesIndex, outputDirectory, + linkedInputDirectories, key -> { synchronized (inputFiles) { inputFiles.add(key); diff --git a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java index d745a6194b..78f4e11dd1 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java @@ -38,7 +38,6 @@ class ShardCASFileCache extends CASFileCache { long maxEntrySizeInBytes, int maxBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -53,7 +52,6 @@ class ShardCASFileCache extends CASFileCache { maxEntrySizeInBytes, maxBucketLevels, storeFileDirsIndexInMemory, - publishTtlMetric, execRootFallback, digestUtil, expireService, diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index 2516791662..a5232d5153 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -32,6 +32,7 @@ import build.bazel.remote.execution.v2.ExecutionStage; import build.bazel.remote.execution.v2.FileNode; import build.bazel.remote.execution.v2.Platform; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.backplane.Backplane; import build.buildfarm.common.CommandUtils; @@ -61,6 +62,8 @@ import build.buildfarm.worker.cgroup.Cpu; import build.buildfarm.worker.cgroup.Group; import build.buildfarm.worker.cgroup.Mem; +import build.buildfarm.worker.resources.LocalResourceSet; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import build.buildfarm.worker.resources.ResourceDecider; import build.buildfarm.worker.resources.ResourceLimits; import com.google.common.annotations.VisibleForTesting; @@ -93,6 +96,7 @@ import java.util.Map; import java.util.Stack; import java.util.logging.Level; +import javax.annotation.Nullable; import lombok.extern.java.Log; @Log @@ -129,6 +133,7 @@ class ShardWorkerContext implements WorkerContext { private final Group operationsGroup = executionsGroup.getChild("operations"); private final CasWriter writer; private final boolean errorOperationRemainingResources; + private final LocalResourceSet resourceSet; static SetMultimap getMatchProvisions( Iterable policies, int executeStageWidth) { @@ -162,6 +167,7 @@ static SetMultimap getMatchProvisions( boolean onlyMulticoreTests, boolean allowBringYourOwnContainer, boolean errorOperationRemainingResources, + LocalResourceSet resourceSet, CasWriter writer) { this.name = name; this.matchProvisions = getMatchProvisions(policies, executeStageWidth); @@ -182,6 +188,7 @@ static SetMultimap getMatchProvisions( this.onlyMulticoreTests = onlyMulticoreTests; this.allowBringYourOwnContainer = allowBringYourOwnContainer; this.errorOperationRemainingResources = errorOperationRemainingResources; + this.resourceSet = resourceSet; this.writer = writer; } @@ -273,6 +280,12 @@ public QueuedOperation getQueuedOperation(QueueEntry queueEntry) @SuppressWarnings("ConstantConditions") private void matchInterruptible(MatchListener listener) throws IOException, InterruptedException { + QueueEntry queueEntry = takeEntryOffOperationQueue(listener); + decideWhetherToKeepOperation(queueEntry, listener); + } + + private QueueEntry takeEntryOffOperationQueue(MatchListener listener) + throws IOException, InterruptedException { listener.onWaitStart(); QueueEntry queueEntry = null; try { @@ -294,9 +307,13 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte // transient backplane errors will propagate a null queueEntry } listener.onWaitEnd(); + return queueEntry; + } + private void decideWhetherToKeepOperation(QueueEntry queueEntry, MatchListener listener) + throws IOException, InterruptedException { if (queueEntry == null - || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, queueEntry)) { + || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, resourceSet, queueEntry)) { listener.onEntry(queueEntry); } else { backplane.rejectOperation(queueEntry); @@ -306,6 +323,11 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte } } + @Override + public void returnLocalResources(QueueEntry queueEntry) { + LocalResourceSetUtils.releaseClaims(queueEntry.getPlatform(), resourceSet); + } + @Override public void match(MatchListener listener) throws InterruptedException { RetryingMatchListener dedupMatchListener = @@ -314,7 +336,7 @@ public void match(MatchListener listener) throws InterruptedException { @Override public boolean getMatched() { - return !matched; + return matched; } @Override @@ -328,20 +350,28 @@ public void onWaitEnd() { } @Override - public boolean onEntry(QueueEntry queueEntry) throws InterruptedException { + public boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException { if (queueEntry == null) { matched = true; return listener.onEntry(null); } + return onValidEntry(queueEntry); + } + + private boolean onValidEntry(QueueEntry queueEntry) throws InterruptedException { String operationName = queueEntry.getExecuteEntry().getOperationName(); if (activeOperations.putIfAbsent(operationName, queueEntry) != null) { log.log(Level.WARNING, "matched duplicate operation " + operationName); return false; } + return onUniqueEntry(queueEntry); + } + + private boolean onUniqueEntry(QueueEntry queueEntry) throws InterruptedException { matched = true; boolean success = listener.onEntry(queueEntry); if (!success) { - requeue(operationName); + requeue(queueEntry.getExecuteEntry().getOperationName()); } return success; } @@ -351,13 +381,8 @@ public void onError(Throwable t) { Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } - - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - listener.setOnCancelHandler(onCancelHandler); - } }; - while (dedupMatchListener.getMatched()) { + while (!dedupMatchListener.getMatched()) { try { matchInterruptible(dedupMatchListener); } catch (IOException e) { @@ -550,6 +575,7 @@ private void uploadOutputFile( static class OutputDirectoryContext { private final List files = new ArrayList<>(); private final List directories = new ArrayList<>(); + private final List symlinks = new ArrayList<>(); void addFile(FileNode fileNode) { files.add(fileNode); @@ -559,10 +585,19 @@ void addDirectory(DirectoryNode directoryNode) { directories.add(directoryNode); } + void addSymlink(SymlinkNode symlinkNode) { + symlinks.add(symlinkNode); + } + Directory toDirectory() { files.sort(Comparator.comparing(FileNode::getName)); directories.sort(Comparator.comparing(DirectoryNode::getName)); - return Directory.newBuilder().addAllFiles(files).addAllDirectories(directories).build(); + symlinks.sort(Comparator.comparing(SymlinkNode::getName)); + return Directory.newBuilder() + .addAllFiles(files) + .addAllDirectories(directories) + .addAllSymlinks(symlinks) + .build(); } } @@ -599,8 +634,30 @@ private void uploadOutputDirectory( @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (configs.getWorker().isCreateSymlinkOutputs() && attrs.isSymbolicLink()) { + visitSymbolicLink(file); + } else { + visitRegularFile(file, attrs); + } + return FileVisitResult.CONTINUE; + } + + private void visitSymbolicLink(Path file) throws IOException { + // TODO convert symlinks with absolute targets within execution root to relative ones + currentDirectory.addSymlink( + SymlinkNode.newBuilder() + .setName(file.getFileName().toString()) + .setTarget(Files.readSymbolicLink(file).toString()) + .build()); + } + + private void visitRegularFile(Path file, BasicFileAttributes attrs) throws IOException { Digest digest; try { + // should we create symlink nodes in output? + // is buildstream trying to execute in a specific container?? + // can get to NSFE for nonexistent symlinks + // can fail outright for a symlink to a directory digest = getDigestUtil().compute(file); } catch (NoSuchFileException e) { log.log( @@ -609,7 +666,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) "error visiting file %s under output dir %s", outputDirPath.relativize(file), outputDirPath.toAbsolutePath()), e); - return FileVisitResult.CONTINUE; + return; } // should we cast to PosixFilePermissions and do gymnastics there for executable? @@ -633,7 +690,6 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) .setDescription( "An output could not be uploaded because it exceeded the maximum size of an entry"); } - return FileVisitResult.CONTINUE; } @Override diff --git a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java index 711c3cf0d3..7d394c9731 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java +++ b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java @@ -18,7 +18,6 @@ import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequestResults; import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.stub.StreamObserver; -import java.util.concurrent.CompletableFuture; import lombok.extern.java.Log; @Log @@ -41,7 +40,7 @@ public void prepareWorkerForGracefulShutdown( PrepareWorkerForGracefulShutDownRequest request, StreamObserver responseObserver) { try { - CompletableFuture.runAsync(worker::prepareWorkerForGracefulShutdown); + worker.initiateShutdown(); responseObserver.onNext(PrepareWorkerForGracefulShutDownRequestResults.newBuilder().build()); responseObserver.onCompleted(); } catch (Exception e) { diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index d740dd7341..7f1bee11b8 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -36,6 +36,7 @@ import build.buildfarm.common.BuildfarmExecutors; import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Cas; import build.buildfarm.common.config.GrpcMetrics; @@ -58,6 +59,8 @@ import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.ReportResultStage; +import build.buildfarm.worker.SuperscalarPipelineStage; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; import com.google.common.util.concurrent.SettableFuture; @@ -70,6 +73,7 @@ import io.grpc.Status; import io.grpc.Status.Code; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; +import io.grpc.protobuf.services.ProtoReflectionService; import io.grpc.services.HealthStatusManager; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; @@ -88,22 +92,14 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import javax.annotation.Nullable; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; import javax.naming.ConfigurationException; import lombok.extern.java.Log; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.ApplicationContext; -import org.springframework.context.annotation.ComponentScan; @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class Worker { +public final class Worker extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -132,7 +128,7 @@ public class Worker { private boolean inGracefulShutdown = false; private boolean isPaused = false; - private ShardWorkerInstance instance; + private WorkerInstance instance; @SuppressWarnings("deprecation") private final HealthStatusManager healthStatusManager = new HealthStatusManager(); @@ -144,8 +140,8 @@ public class Worker { private Pipeline pipeline; private Backplane backplane; private LoadingCache workerStubs; + private AtomicBoolean released = new AtomicBoolean(true); - @Autowired private ApplicationContext springContext; /** * The method will prepare the worker for graceful shutdown when the worker is ready. Note on * using stderr here instead of log. By the time this is called in PreDestroy, the log is no @@ -153,37 +149,35 @@ public class Worker { */ public void prepareWorkerForGracefulShutdown() { if (configs.getWorker().getGracefulShutdownSeconds() == 0) { - System.err.println( + log.info( String.format( "Graceful Shutdown is not enabled. Worker is shutting down without finishing executions in progress.")); } else { inGracefulShutdown = true; - System.err.println( + log.info( "Graceful Shutdown - The current worker will not be registered again and should be shutdown gracefully!"); pipeline.stopMatchingOperations(); int scanRate = 30; // check every 30 seconds int timeWaited = 0; int timeOut = configs.getWorker().getGracefulShutdownSeconds(); - try { if (pipeline.isEmpty()) { - System.err.println("Graceful Shutdown - no work in the pipeline."); + log.info("Graceful Shutdown - no work in the pipeline."); } else { - System.err.println( - String.format("Graceful Shutdown - waiting for executions to finish.")); + log.info(String.format("Graceful Shutdown - waiting for executions to finish.")); } while (!pipeline.isEmpty() && timeWaited < timeOut) { SECONDS.sleep(scanRate); timeWaited += scanRate; - System.err.println( + log.info( String.format( "Graceful Shutdown - Pipeline is still not empty after %d seconds.", timeWaited)); } } catch (InterruptedException e) { - System.err.println( + log.info( "Graceful Shutdown - The worker gracefully shutdown is interrupted: " + e.getMessage()); } finally { - System.err.println( + log.info( String.format( "Graceful Shutdown - It took the worker %d seconds to %s", timeWaited, @@ -194,41 +188,8 @@ public void prepareWorkerForGracefulShutdown() { } } - private void exitPostPipelineFailure() { - // Shutdown the worker if a pipeline fails. By means of the spring lifecycle - // hooks - e.g. the `PreDestroy` hook here - it will attempt to gracefully - // spin down the pipeline - - // By calling these spring shutdown facilities; we're open to the risk that - // a subsystem may be hanging a criticial thread indeffinitly. Deadline the - // shutdown workflow to ensure we don't leave a zombie worker in this - // situation - ScheduledExecutorService shutdownDeadlineExecutor = newSingleThreadScheduledExecutor(); - - // This may be shorter than the action timeout; assume we have interrupted - // actions in a fatal uncaught exception. - int forceShutdownDeadline = 60; - ScheduledFuture termFuture = - shutdownDeadlineExecutor.schedule( - new Runnable() { - public void run() { - log.log( - Level.SEVERE, - String.format( - "Force terminating due to shutdown deadline exceeded (%d seconds)", - forceShutdownDeadline)); - System.exit(1); - } - }, - forceShutdownDeadline, - SECONDS); - - // Consider defining exit codes to better afford out of band instance - // recovery - int code = SpringApplication.exit(springContext, () -> 1); - termFuture.cancel(false); - shutdownDeadlineExecutor.shutdown(); - System.exit(code); + public Worker() { + super("BuildFarmShardWorker"); } private Operation stripOperation(Operation operation) { @@ -241,7 +202,7 @@ private Operation stripQueuedOperation(Operation operation) { private Server createServer( ServerBuilder serverBuilder, - ContentAddressableStorage storage, + @Nullable CASFileCache storage, Instance instance, Pipeline pipeline, ShardWorkerContext context) { @@ -249,19 +210,20 @@ private Server createServer( serverBuilder.addService(new ContentAddressableStorageService(instance)); serverBuilder.addService(new ByteStreamService(instance)); serverBuilder.addService(new ShutDownWorkerGracefully(this)); + serverBuilder.addService(ProtoReflectionService.newInstance()); // We will build a worker's server based on it's capabilities. // A worker that is capable of execution will construct an execution pipeline. // It will use various execution phases for it's profile service. // On the other hand, a worker that is only capable of CAS storage does not need a pipeline. if (configs.getWorker().getCapabilities().isExecution()) { - PipelineStage completeStage = - new PutOperationStage((operation) -> context.deactivate(operation.getName())); + PutOperationStage completeStage = + new PutOperationStage(operation -> context.deactivate(operation.getName())); PipelineStage errorStage = completeStage; /* new ErrorStage(); */ PipelineStage reportResultStage = new ReportResultStage(context, completeStage, errorStage); - PipelineStage executeActionStage = + SuperscalarPipelineStage executeActionStage = new ExecuteActionStage(context, reportResultStage, errorStage); - PipelineStage inputFetchStage = + SuperscalarPipelineStage inputFetchStage = new InputFetchStage(context, executeActionStage, new PutOperationStage(context::requeue)); PipelineStage matchStage = new MatchStage(context, inputFetchStage, errorStage); @@ -272,7 +234,13 @@ private Server createServer( serverBuilder.addService( new WorkerProfileService( - storage, inputFetchStage, executeActionStage, context, completeStage, backplane)); + storage, + matchStage, + inputFetchStage, + executeActionStage, + reportResultStage, + completeStage, + backplane)); } GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, configs.getWorker().getGrpcMetrics()); serverBuilder.intercept(new ServerHeadersInterceptor()); @@ -398,7 +366,6 @@ private ContentAddressableStorage createStorage( // delegate level cas.getHexBucketLevels(), cas.isFileDirectoriesIndexInMemory(), - cas.isPublishTtlMetric(), cas.isExecRootCopyFallback(), digestUtil, removeDirectoryService, @@ -420,7 +387,8 @@ private ExecFileSystem createCFCExecFileSystem( fileCache, owner, configs.getWorker().isLinkInputDirectories(), - configs.getWorker().getRealInputDirectories(), + configs.getWorker().getLinkedInputDirectories(), + configs.isAllowSymlinkTargetAbsolute(), removeDirectoryService, accessRecorder /* deadlineAfter=*/ @@ -546,13 +514,6 @@ public void run() { } } catch (InterruptedException e) { // ignore - } finally { - try { - stop(); - } catch (InterruptedException ie) { - log.log(SEVERE, "interrupted while stopping worker", ie); - // ignore - } } } }, @@ -572,6 +533,7 @@ private long loadWorkerStartTimeInMillis() { } public void start() throws ConfigurationException, InterruptedException, IOException { + released.set(false); String session = UUID.randomUUID().toString(); ServerBuilder serverBuilder = ServerBuilder.forPort(configs.getWorker().getPort()); String identifier = "buildfarm-worker-" + configs.getWorker().getPublicName() + "-" + session; @@ -584,7 +546,12 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep if (SHARD.equals(configs.getBackplane().getType())) { backplane = - new RedisShardBackplane(identifier, this::stripOperation, this::stripQueuedOperation); + new RedisShardBackplane( + identifier, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + this::stripOperation, + this::stripQueuedOperation); backplane.start(configs.getWorker().getPublicName()); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); @@ -616,8 +583,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep remoteInputStreamFactory, removeDirectoryService, accessRecorder, storage); instance = - new ShardWorkerInstance( - configs.getWorker().getPublicName(), digestUtil, backplane, storage); + new WorkerInstance(configs.getWorker().getPublicName(), digestUtil, backplane, storage); // Create the appropriate writer for the context CasWriter writer; @@ -650,10 +616,11 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep configs.getWorker().isOnlyMulticoreTests(), configs.getWorker().isAllowBringYourOwnContainer(), configs.getWorker().isErrorOperationRemainingResources(), + LocalResourceSetUtils.create(configs.getWorker().getResources()), writer); pipeline = new Pipeline(); - server = createServer(serverBuilder, storage, instance, pipeline, context); + server = createServer(serverBuilder, (CASFileCache) storage, instance, pipeline, context); removeWorker(configs.getWorker().getPublicName()); @@ -667,12 +634,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep PrometheusPublisher.startHttpServer(configs.getPrometheusPort()); startFailsafeRegistration(); - // Listen for pipeline unhandled exceptions - ExecutorService pipelineExceptionExecutor = newSingleThreadExecutor(); - SettableFuture pipelineExceptionFuture = SettableFuture.create(); - pipelineExceptionFuture.addListener(this::exitPostPipelineFailure, pipelineExceptionExecutor); - - pipeline.start(pipelineExceptionFuture); + pipeline.start(); healthCheckMetric.labels("start").inc(); executionSlotsTotal.set(configs.getWorker().getExecuteStageWidth()); @@ -681,9 +643,41 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep log.log(INFO, String.format("%s initialized", identifier)); } - @PreDestroy - public void stop() throws InterruptedException { - System.err.println("*** shutting down gRPC server since JVM is shutting down"); + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); + } + + private void awaitTermination() throws InterruptedException { + pipeline.join(); + server.awaitTermination(); + } + + public void initiateShutdown() { + pipeline.stopMatchingOperations(); + if (server != null) { + server.shutdown(); + } + } + + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); + } + } + + public synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); + } + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); prepareWorkerForGracefulShutdown(); PrometheusPublisher.stopHttpServer(); boolean interrupted = Thread.interrupted(); @@ -702,11 +696,12 @@ public void stop() throws InterruptedException { executionSlotsTotal.set(0); inputFetchSlotsTotal.set(0); if (execFileSystem != null) { - log.log(INFO, "Stopping exec filesystem"); + log.info("Stopping exec filesystem"); execFileSystem.stop(); + execFileSystem = null; } if (server != null) { - log.log(INFO, "Shutting down the server"); + log.info("Shutting down the server"); server.shutdown(); try { @@ -717,26 +712,28 @@ public void stop() throws InterruptedException { } finally { server.shutdownNow(); } + server = null; } if (backplane != null) { try { backplane.stop(); + backplane = null; } catch (InterruptedException e) { interrupted = true; } } if (workerStubs != null) { workerStubs.invalidateAll(); + workerStubs = null; } if (interrupted) { Thread.currentThread().interrupt(); throw new InterruptedException(); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); } - @PostConstruct - public void init() throws OptionsParsingException { + public static void main(String[] args) throws Exception { // Only log severe log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -744,19 +741,17 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); + configs = BuildfarmConfigs.loadWorkerConfigs(args); + Worker worker = new Worker(); try { - start(); + worker.start(); + worker.awaitTermination(); } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); + log.severe(formatIOError(e)); } catch (InterruptedException e) { - System.out.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); + log.log(Level.WARNING, "interrupted", e); + } finally { + worker.stop(); } } - - public static void main(String[] args) throws ConfigurationException { - configs = BuildfarmConfigs.loadWorkerConfigs(args); - SpringApplication.run(Worker.class, args); - } } diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java similarity index 98% rename from src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java rename to src/main/java/build/buildfarm/worker/shard/WorkerInstance.java index a891af7faa..e100417f53 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java @@ -36,7 +36,7 @@ import build.buildfarm.common.Write; import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.MatchListener; -import build.buildfarm.instance.server.AbstractServerInstance; +import build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -68,13 +68,13 @@ import lombok.extern.java.Log; @Log -public class ShardWorkerInstance extends AbstractServerInstance { +public class WorkerInstance extends NodeInstance { private static final Counter IO_METRIC = Counter.build().name("io_bytes_read").help("Read I/O (bytes)").register(); private final Backplane backplane; - public ShardWorkerInstance( + public WorkerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -346,7 +346,7 @@ protected static ExecuteOperationMetadata expectExecuteOperationMetadata(Operati return null; } } else { - return AbstractServerInstance.expectExecuteOperationMetadata(operation); + return NodeInstance.expectExecuteOperationMetadata(operation); } } diff --git a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java index 6eee990426..f00c57cc3d 100644 --- a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java @@ -15,7 +15,6 @@ package build.buildfarm.worker.shard; import build.buildfarm.backplane.Backplane; -import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.v1test.OperationTimesBetweenStages; import build.buildfarm.v1test.StageInformation; @@ -24,67 +23,90 @@ import build.buildfarm.v1test.WorkerProfileGrpc; import build.buildfarm.v1test.WorkerProfileMessage; import build.buildfarm.v1test.WorkerProfileRequest; -import build.buildfarm.worker.ExecuteActionStage; -import build.buildfarm.worker.InputFetchStage; import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.PutOperationStage.OperationStageDurations; -import build.buildfarm.worker.WorkerContext; +import build.buildfarm.worker.SuperscalarPipelineStage; import io.grpc.stub.StreamObserver; import java.io.IOException; +import javax.annotation.Nullable; public class WorkerProfileService extends WorkerProfileGrpc.WorkerProfileImplBase { - private final CASFileCache storage; - private final InputFetchStage inputFetchStage; - private final ExecuteActionStage executeActionStage; - private final WorkerContext context; + private final @Nullable CASFileCache storage; + private final PipelineStage matchStage; + private final SuperscalarPipelineStage inputFetchStage; + private final SuperscalarPipelineStage executeActionStage; + private final PipelineStage reportResultStage; private final PutOperationStage completeStage; private final Backplane backplane; public WorkerProfileService( - ContentAddressableStorage storage, - PipelineStage inputFetchStage, - PipelineStage executeActionStage, - WorkerContext context, - PipelineStage completeStage, + @Nullable CASFileCache storage, + PipelineStage matchStage, + SuperscalarPipelineStage inputFetchStage, + SuperscalarPipelineStage executeActionStage, + PipelineStage reportResultStage, + PutOperationStage completeStage, Backplane backplane) { - this.storage = (CASFileCache) storage; - this.inputFetchStage = (InputFetchStage) inputFetchStage; - this.executeActionStage = (ExecuteActionStage) executeActionStage; - this.context = context; + this.storage = storage; + this.matchStage = matchStage; + this.inputFetchStage = inputFetchStage; + this.executeActionStage = executeActionStage; + this.reportResultStage = reportResultStage; this.completeStage = (PutOperationStage) completeStage; this.backplane = backplane; } + private StageInformation unaryStageInformation(String name, @Nullable String operationName) { + StageInformation.Builder builder = + StageInformation.newBuilder().setName(name).setSlotsConfigured(1); + if (operationName != null) { + builder.setSlotsUsed(1).addOperationNames(operationName); + } + return builder.build(); + } + + private StageInformation superscalarStageInformation(SuperscalarPipelineStage stage) { + return StageInformation.newBuilder() + .setName(stage.getName()) + .setSlotsConfigured(stage.getWidth()) + .setSlotsUsed(stage.getSlotUsage()) + .addAllOperationNames(stage.getOperationNames()) + .build(); + } + @Override public void getWorkerProfile( WorkerProfileRequest request, StreamObserver responseObserver) { // get usage of CASFileCache - WorkerProfileMessage.Builder replyBuilder = - WorkerProfileMessage.newBuilder() - .setCasSize(storage.size()) - .setCasEntryCount(storage.entryCount()) - .setCasMaxSize(storage.maxSize()) - .setCasMaxEntrySize(storage.maxEntrySize()) - .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) - .setCasDirectoryEntryCount(storage.directoryStorageCount()) - .setCasEvictedEntryCount(storage.getEvictedCount()) - .setCasEvictedEntrySize(storage.getEvictedSize()); + WorkerProfileMessage.Builder replyBuilder = WorkerProfileMessage.newBuilder(); + + // FIXME deliver full local storage chain + if (storage != null) { + replyBuilder + .setCasSize(storage.size()) + .setCasEntryCount(storage.entryCount()) + .setCasMaxSize(storage.maxSize()) + .setCasMaxEntrySize(storage.maxEntrySize()) + .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) + .setCasDirectoryEntryCount(storage.directoryStorageCount()) + .setCasEvictedEntryCount(storage.getEvictedCount()) + .setCasEvictedEntrySize(storage.getEvictedSize()); + } // get slots configured and used of superscalar stages + // prefer reverse order to avoid double counting if possible + // these stats are not consistent across their sampling and will + // produce: slots that are not consistent with operations, operations + // in multiple stages even in reverse due to claim progress + // in short: this is for monitoring, not for guaranteed consistency checks + String reportResultOperation = reportResultStage.getOperationName(); + String matchOperation = matchStage.getOperationName(); replyBuilder - .addStages( - StageInformation.newBuilder() - .setName("InputFetchStage") - .setSlotsConfigured(context.getInputFetchStageWidth()) - .setSlotsUsed(inputFetchStage.getSlotUsage()) - .build()) - .addStages( - StageInformation.newBuilder() - .setName("ExecuteActionStage") - .setSlotsConfigured(context.getExecuteStageWidth()) - .setSlotsUsed(executeActionStage.getSlotUsage()) - .build()); + .addStages(unaryStageInformation(reportResultStage.getName(), reportResultOperation)) + .addStages(superscalarStageInformation(executeActionStage)) + .addStages(superscalarStageInformation(inputFetchStage)) + .addStages(unaryStageInformation(matchStage.getName(), matchOperation)); // get average time costs on each stage OperationStageDurations[] durations = completeStage.getAverageTimeCostPerStage(); diff --git a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto index 56216577f0..cf3ce805b7 100644 --- a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto +++ b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto @@ -616,6 +616,8 @@ message StageInformation { // number of slots used for this stage int32 slots_used = 3; + + repeated string operation_names = 4; } message WorkerProfileMessage { diff --git a/src/test/java/build/buildfarm/cas/BUILD b/src/test/java/build/buildfarm/cas/BUILD index ff7774e067..fc127b1ea6 100644 --- a/src/test/java/build/buildfarm/cas/BUILD +++ b/src/test/java/build/buildfarm/cas/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["**/*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/cas", diff --git a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java index 1bd1791999..caaab02b35 100644 --- a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java @@ -19,8 +19,11 @@ import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; +import static java.lang.Thread.State.TERMINATED; +import static java.lang.Thread.State.WAITING; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.junit.Assert.fail; import static org.mockito.Mockito.any; @@ -58,6 +61,7 @@ import com.google.common.collect.Maps; import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; +import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; @@ -77,6 +81,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -147,7 +152,6 @@ public void setUp() throws IOException, InterruptedException { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -241,7 +245,8 @@ public void putDirectoryCreatesTree() throws IOException, InterruptedException { subdirDigest, subDirectory); Path dirPath = getInterruptiblyOrIOException( - fileCache.putDirectory(dirDigest, directoriesIndex, putService)); + fileCache.putDirectory(dirDigest, directoriesIndex, putService)) + .getPath(); assertThat(Files.isDirectory(dirPath)).isTrue(); assertThat(Files.exists(dirPath.resolve("file"))).isTrue(); assertThat(Files.isDirectory(dirPath.resolve("subdir"))).isTrue(); @@ -1111,7 +1116,6 @@ public void copyExternalInputRetries() throws Exception { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1175,7 +1179,6 @@ public void newInputThrowsNoSuchFileExceptionWithoutDelegate() throws Exception /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1209,6 +1212,114 @@ protected InputStream newExternalInput( assertThat(expected).isNotNull(); } + @Test + public void testConcurrentWrites() throws Exception { + ByteString blob = ByteString.copyFromUtf8("concurrent write"); + Digest digest = DIGEST_UTIL.compute(blob); + UUID uuid = UUID.randomUUID(); + // The same instance of Write will be passed to both the threads, so that the both threads + // try to get same output stream. + Write write = + fileCache.getWrite( + Compressor.Value.IDENTITY, digest, uuid, RequestMetadata.getDefaultInstance()); + + CyclicBarrier barrier = new CyclicBarrier(3); + + Thread write1 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + barrier.await(); // let both the threads get same write stream. + writeStreamObserver.ownStream(); // let other thread get the ownership of stream + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "FirstRequest"); + Thread write2 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + writeStreamObserver.ownStream(); // this thread will get the ownership of stream + barrier.await(); // let both the threads get same write stream. + while (write1.getState() != WAITING) ; // wait for first request to go in wait state + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "SecondRequest"); + write1.start(); + write2.start(); + barrier.await(); // let both the requests reach the critical section + + // Wait for each write operation to complete, allowing a maximum of 100ms per write. + // Note: A 100ms wait time allowed 1000 * 8 successful test runs. + // In certain scenario, even this wait time may not be enough and test still be called flaky. + // But setting wait time 0 may cause test to wait forever (if there is issue in code) and the + // build might fail with timeout error. + write1.join(100); + write2.join(100); + + assertThat(write1.getState()).isEqualTo(TERMINATED); + assertThat(write2.getState()).isEqualTo(TERMINATED); + } + + static class ConcurrentWriteStreamObserver { + Write write; + FeedbackOutputStream out; + + ConcurrentWriteStreamObserver(Write write) { + this.write = write; + } + + void registerCallback() { + Futures.addCallback( + write.getFuture(), + new FutureCallback() { + @Override + public void onSuccess(Long committedSize) { + commit(); + } + + @Override + public void onFailure(Throwable t) { + // do nothing + } + }, + directExecutor()); + } + + synchronized void ownStream() throws Exception { + this.out = write.getOutput(10, MILLISECONDS, () -> {}); + } + /** + * Request 1 may invoke this method for request 2 or vice-versa via callback on + * write.getFuture(). Synchronization is necessary to prevent conflicts when this method is + * called simultaneously by different threads. + */ + synchronized void commit() { + // critical section + } + + void write(ByteString data) throws IOException { + data.writeTo(out); + } + + void close() throws IOException { + out.close(); + } + } + @RunWith(JUnit4.class) public static class NativeFileDirsIndexInMemoryCASFileCacheTest extends CASFileCacheTest { public NativeFileDirsIndexInMemoryCASFileCacheTest() throws IOException { diff --git a/src/test/java/build/buildfarm/common/BUILD b/src/test/java/build/buildfarm/common/BUILD index bda4f4243f..70ff4abc94 100644 --- a/src/test/java/build/buildfarm/common/BUILD +++ b/src/test/java/build/buildfarm/common/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/common", diff --git a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java index 5301b82608..547f81b2ca 100644 --- a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java +++ b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java @@ -30,7 +30,6 @@ public void testHandleGrpcMetricIntercepts_withLatencyBucket() { grpcMetrics.setEnabled(true); grpcMetrics.setProvideLatencyHistograms(true); grpcMetrics.setLatencyBuckets(new double[] {1, 2, 3}); - GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, grpcMetrics); verify(serverBuilder, times(1)).intercept(any(MonitoringServerInterceptor.class)); } diff --git a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java similarity index 89% rename from src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java rename to src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java index 719806b9f7..319a4a5cc5 100644 --- a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java @@ -17,12 +17,13 @@ import static build.buildfarm.common.Actions.checkPreconditionFailure; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; -import static build.buildfarm.instance.server.AbstractServerInstance.DIRECTORY_NOT_SORTED; -import static build.buildfarm.instance.server.AbstractServerInstance.DUPLICATE_DIRENT; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_COMMAND; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; +import static build.buildfarm.instance.server.NodeInstance.DIRECTORY_NOT_SORTED; +import static build.buildfarm.instance.server.NodeInstance.DUPLICATE_DIRENT; +import static build.buildfarm.instance.server.NodeInstance.INVALID_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.SYMLINK_TARGET_ABSOLUTE; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; import static org.mockito.Mockito.any; @@ -43,6 +44,7 @@ import build.bazel.remote.execution.v2.OutputDirectory; import build.bazel.remote.execution.v2.Platform; import build.bazel.remote.execution.v2.RequestMetadata; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.cas.ContentAddressableStorage; @@ -97,10 +99,10 @@ @RunWith(JUnit4.class) @Log -public class AbstractServerInstanceTest { +public class NodeInstanceTest { private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); - static class DummyServerInstance extends AbstractServerInstance { + static class DummyServerInstance extends NodeInstance { DummyServerInstance( ContentAddressableStorage contentAddressableStorage, ActionCache actionCache) { super( @@ -259,7 +261,7 @@ public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() @Test public void duplicateFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -270,6 +272,7 @@ public void duplicateFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFile=*/ file -> {}, /* onInputDirectorie=*/ directory -> {}, /* onInputDigest=*/ digest -> {}, @@ -287,7 +290,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -304,6 +307,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(Digest.getDefaultInstance(), emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -316,7 +320,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { @Test public void unsortedFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -327,6 +331,7 @@ public void unsortedFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -344,7 +349,7 @@ public void duplicateDirectoryInputIsInvalid() { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -361,6 +366,7 @@ public void duplicateDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -378,7 +384,7 @@ public void unsortedDirectoryInputIsInvalid() { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -395,6 +401,7 @@ public void unsortedDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -407,10 +414,52 @@ public void unsortedDirectoryInputIsInvalid() { assertThat(violation.getDescription()).isEqualTo(DIRECTORY_NOT_SORTED); } + @Test + public void shouldValidateIfSymlinkTargetAbsolute() { + // invalid for disallowed + PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); + Directory absoluteSymlinkDirectory = + Directory.newBuilder() + .addSymlinks(SymlinkNode.newBuilder().setName("foo").setTarget("/root/secret").build()) + .build(); + NodeInstance.validateActionInputDirectory( + ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(1); + Violation violation = preconditionFailure.getViolationsList().get(0); + assertThat(violation.getType()).isEqualTo(VIOLATION_TYPE_INVALID); + assertThat(violation.getSubject()).isEqualTo("/: foo -> /root/secret"); + assertThat(violation.getDescription()).isEqualTo(SYMLINK_TARGET_ABSOLUTE); + + // valid for allowed + preconditionFailure = PreconditionFailure.newBuilder(); + NodeInstance.validateActionInputDirectory( + ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ true, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(0); + } + @Test public void nestedOutputDirectoriesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of(), @@ -427,7 +476,7 @@ public void nestedOutputDirectoriesAreInvalid() { @Test public void outputDirectoriesContainingOutputFilesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo/bar"), @@ -444,7 +493,7 @@ public void outputDirectoriesContainingOutputFilesAreInvalid() { @Test public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo"), @@ -460,7 +509,7 @@ public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { @Test public void emptyArgumentListIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -480,7 +529,7 @@ public void emptyArgumentListIsInvalid() { @Test public void absoluteWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -500,7 +549,7 @@ public void absoluteWorkingDirectoryIsInvalid() { @Test public void undeclaredWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); Digest inputRootDigest = DIGEST_UTIL.compute(Directory.getDefaultInstance()); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); @@ -541,12 +590,13 @@ public void multipleIdenticalDirectoryMissingAreAllPreconditionFailures() { .setDigest(missingDirectoryDigest) .build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -602,12 +652,13 @@ public void validationRevisitReplicatesPreconditionFailures() { DirectoryNode.newBuilder().setName("bar").setDigest(fooDigest).build(), DirectoryNode.newBuilder().setName("foo").setDigest(fooDigest).build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(fooDigest, foo), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -670,8 +721,7 @@ public void outputDirectoriesFilesAreEnsuredPresent() throws Exception { .build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); ActionCache actionCache = mock(ActionCache.class); - AbstractServerInstance instance = - new DummyServerInstance(contentAddressableStorage, actionCache); + NodeInstance instance = new DummyServerInstance(contentAddressableStorage, actionCache); Tree tree = Tree.newBuilder() @@ -731,7 +781,7 @@ public void fetchBlobWriteCompleteIsSuccess() throws Exception { Digest expectedDigest = contentDigest.toBuilder().setSizeBytes(-1).build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); - AbstractServerInstance instance = new DummyServerInstance(contentAddressableStorage, null); + NodeInstance instance = new DummyServerInstance(contentAddressableStorage, null); RequestMetadata requestMetadata = RequestMetadata.getDefaultInstance(); Write write = mock(Write.class); diff --git a/src/test/java/build/buildfarm/instance/shard/BUILD b/src/test/java/build/buildfarm/instance/shard/BUILD index 6a501a1b68..2f08b1003d 100644 --- a/src/test/java/build/buildfarm/instance/shard/BUILD +++ b/src/test/java/build/buildfarm/instance/shard/BUILD @@ -1,7 +1,158 @@ java_test( - name = "tests", + name = "DispatchedMonitorTest", size = "small", - srcs = glob(["*.java"]), + srcs = [ + "DispatchedMonitorTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardBackplaneTest", + size = "small", + srcs = [ + "RedisShardBackplaneTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardSubscriberTest", + size = "small", + srcs = [ + "RedisShardSubscriberTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "ServerInstanceTest", + size = "small", + srcs = [ + "ServerInstanceTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "TimedWatcherTest", + size = "small", + srcs = [ + "TimedWatcherTest.java", + "UnobservableWatcher.java", + ], data = ["//examples:example_configs"], test_class = "build.buildfarm.AllTests", deps = [ @@ -31,3 +182,59 @@ java_test( "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", ], ) + +java_test( + name = "UtilTest", + size = "small", + srcs = [ + "UnobservableWatcher.java", + "UtilTest.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "JedisCasWorkerMapTest", + size = "small", + srcs = [ + "JedisCasWorkerMapTest.java", + ], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/redis", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@maven//:com_github_fppt_jedis_mock", + "@maven//:com_google_truth_truth", + ], +) diff --git a/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java new file mode 100644 index 0000000000..caa69536c1 --- /dev/null +++ b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java @@ -0,0 +1,63 @@ +package build.buildfarm.instance.shard; + +import static com.google.common.truth.Truth.assertThat; + +import build.bazel.remote.execution.v2.Digest; +import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.redis.RedisClient; +import com.github.fppt.jedismock.RedisServer; +import com.github.fppt.jedismock.server.ServiceOptions; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import redis.clients.jedis.HostAndPort; +import redis.clients.jedis.JedisCluster; + +@RunWith(JUnit4.class) +public class JedisCasWorkerMapTest { + + private static final String CAS_PREFIX = "ContentAddressableStorage"; + + private RedisServer redisServer; + private RedisClient redisClient; + private JedisCasWorkerMap jedisCasWorkerMap; + + @Before + public void setup() throws IOException { + redisServer = + RedisServer.newRedisServer() + .setOptions(ServiceOptions.defaultOptions().withClusterModeEnabled()) + .start(); + redisClient = + new RedisClient( + new JedisCluster( + Collections.singleton( + new HostAndPort(redisServer.getHost(), redisServer.getBindPort())))); + jedisCasWorkerMap = new JedisCasWorkerMap(CAS_PREFIX, 60); + } + + @Test + public void testSetExpire() throws IOException { + Digest testDigest1 = Digest.newBuilder().setHash("abc").build(); + Digest testDigest2 = Digest.newBuilder().setHash("xyz").build(); + + String casKey1 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest1); + String casKey2 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest2); + + redisClient.run(jedis -> jedis.sadd(casKey1, "worker1")); + jedisCasWorkerMap.setExpire(redisClient, Arrays.asList(testDigest1, testDigest2)); + + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey1))).isGreaterThan(0L); + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey2))).isEqualTo(-2L); + } + + @After + public void tearDown() throws IOException { + redisServer.stop(); + } +} diff --git a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java index 51c7c0568d..a77e74a3bd 100644 --- a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java +++ b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java @@ -59,7 +59,6 @@ @RunWith(JUnit4.class) public class RedisShardBackplaneTest { - private RedisShardBackplane backplane; private BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); @Mock Supplier mockJedisClusterFactory; @@ -67,12 +66,20 @@ public class RedisShardBackplaneTest { @Before public void setUp() throws IOException { configs.getBackplane().setOperationExpire(10); - configs.getBackplane().setSubscribeToBackplane(false); - configs.getBackplane().setRunFailsafeOperation(false); configs.getBackplane().setQueues(new Queue[] {}); MockitoAnnotations.initMocks(this); } + public RedisShardBackplane createBackplane(String name) { + return new RedisShardBackplane( + name, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + o -> o, + o -> o, + mockJedisClusterFactory); + } + @Test public void workersWithInvalidProtobufAreRemoved() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); @@ -81,9 +88,7 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { .thenReturn(ImmutableMap.of("foo", "foo")); when(jedisCluster.hdel(configs.getBackplane().getWorkersHashName() + "_storage", "foo")) .thenReturn(1L); - backplane = - new RedisShardBackplane( - "invalid-protobuf-worker-removed-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invalid-protobuf-worker-removed-test"); backplane.start("startTime/test:0000"); assertThat(backplane.getStorageWorkers()).isEmpty(); @@ -100,12 +105,10 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { assertThat(workerChange.getTypeCase()).isEqualTo(WorkerChange.TypeCase.REMOVE); } - void verifyChangePublished(JedisCluster jedis) throws IOException { + OperationChange verifyChangePublished(String channel, JedisCluster jedis) throws IOException { ArgumentCaptor changeCaptor = ArgumentCaptor.forClass(String.class); - verify(jedis, times(1)).publish(eq(backplane.operationChannel("op")), changeCaptor.capture()); - OperationChange opChange = parseOperationChange(changeCaptor.getValue()); - assertThat(opChange.hasReset()).isTrue(); - assertThat(opChange.getReset().getOperation().getName()).isEqualTo("op"); + verify(jedis, times(1)).publish(eq(channel), changeCaptor.capture()); + return parseOperationChange(changeCaptor.getValue()); } String operationName(String name) { @@ -116,9 +119,7 @@ String operationName(String name) { public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "prequeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("prequeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -136,32 +137,34 @@ public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getPreQueuedOperationsListName(), JsonFormat.printer().print(executeEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void queuingPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; backplane.queueing(opName); verify(mockJedisClusterFactory, times(1)).get(); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -181,7 +184,10 @@ public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getQueuedOperationsListName(), JsonFormat.printer().print(queueEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -195,9 +201,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // ARRANGE @@ -210,8 +214,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) @@ -251,9 +254,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // Assume the operation queue is already populated from a first re-queue. @@ -265,8 +266,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) @@ -299,9 +299,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() public void completeOperationUndispatches() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "complete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("complete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -318,9 +316,7 @@ public void completeOperationUndispatches() throws IOException { public void deleteOperationDeletesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "delete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("delete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -331,7 +327,10 @@ public void deleteOperationDeletesAndPublishes() throws IOException { verify(jedisCluster, times(1)) .hdel(configs.getBackplane().getDispatchedOperationsHashName(), opName); verify(jedisCluster, times(1)).del(operationName(opName)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -342,9 +341,7 @@ public void invocationsCanBeBlacklisted() throws IOException { configs.getBackplane().getInvocationBlacklistPrefix() + ":" + toolInvocationId; when(jedisCluster.exists(invocationBlacklistKey)).thenReturn(true); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "invocation-blacklist-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invocation-blacklist-test"); backplane.start("startTime/test:0000"); assertThat( @@ -362,8 +359,7 @@ public void invocationsCanBeBlacklisted() throws IOException { public void testGetWorkersStartTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("workers-starttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("workers-starttime-test"); backplane.start("startTime/test:0000"); Set workerNames = ImmutableSet.of("worker1", "worker2", "missing_worker"); @@ -387,8 +383,7 @@ public void testGetWorkersStartTime() throws IOException { public void getDigestInsertTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("startTime/test:0000"); long ttl = 3600L; long expirationInSecs = configs.getBackplane().getCasExpire(); @@ -412,8 +407,7 @@ public void testAddWorker() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); when(jedisCluster.hset(anyString(), anyString(), anyString())).thenReturn(1L); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("addWorker/test:0000"); backplane.addWorker(shardWorker); verify(jedisCluster, times(1)) diff --git a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java similarity index 97% rename from src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java rename to src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java index 908b3c826b..87554bc3e0 100644 --- a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java @@ -20,9 +20,9 @@ import static build.buildfarm.common.Actions.invalidActionVerboseMessage; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_PLATFORM; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_ACTION; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.INVALID_PLATFORM; +import static build.buildfarm.instance.server.NodeInstance.MISSING_ACTION; +import static build.buildfarm.instance.server.NodeInstance.MISSING_COMMAND; import static com.google.common.base.Predicates.notNull; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; @@ -31,9 +31,13 @@ import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.SECONDS; import static org.mockito.AdditionalAnswers.answer; +import static org.mockito.ArgumentMatchers.anyIterable; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atLeast; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; @@ -81,6 +85,7 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.longrunning.Operation; import com.google.protobuf.Any; @@ -96,6 +101,7 @@ import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -122,14 +128,14 @@ import org.mockito.stubbing.Answer; @RunWith(JUnit4.class) -public class ShardInstanceTest { +public class ServerInstanceTest { private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); private static final long QUEUE_TEST_TIMEOUT_SECONDS = 3; private static final Duration DEFAULT_TIMEOUT = Durations.fromSeconds(60); private static final Command SIMPLE_COMMAND = Command.newBuilder().addAllArguments(ImmutableList.of("true")).build(); - private ShardInstance instance; + private ServerInstance instance; private Map blobDigests; @Mock private Backplane mockBackplane; @@ -146,7 +152,7 @@ public void setUp() throws InterruptedException { blobDigests = Maps.newHashMap(); ActionCache actionCache = new ShardActionCache(10, mockBackplane, newDirectExecutorService()); instance = - new ShardInstance( + new ServerInstance( "shard", DIGEST_UTIL, mockBackplane, @@ -1055,7 +1061,6 @@ public void containsBlobReflectsWorkerWithUnknownSize() throws Exception { @Test public void findMissingBlobsTest_ViaBackPlane() throws Exception { - Set activeWorkers = ImmutableSet.of("worker1", "worker2", "worker3"); Set expiredWorkers = ImmutableSet.of("workerX", "workerY", "workerZ"); Set imposterWorkers = ImmutableSet.of("imposter1", "imposter2", "imposter3"); @@ -1114,8 +1119,12 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception { buildfarmConfigs.getServer().setFindMissingBlobsViaBackplane(true); Set activeAndImposterWorkers = Sets.newHashSet(Iterables.concat(activeWorkers, imposterWorkers)); + when(mockBackplane.getStorageWorkers()).thenReturn(activeAndImposterWorkers); when(mockBackplane.getBlobDigestsWorkers(any(Iterable.class))).thenReturn(digestAndWorkersMap); + when(mockInstanceLoader.load(anyString())).thenReturn(mockWorkerInstance); + when(mockWorkerInstance.findMissingBlobs(anyIterable(), any(RequestMetadata.class))) + .thenReturn(Futures.immediateFuture(new ArrayList<>())); long serverStartTime = 1686951033L; // june 15th, 2023 Map workersStartTime = new HashMap<>(); @@ -1138,6 +1147,10 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception { Iterables.concat(missingDigests, digestAvailableOnImposters); assertThat(actualMissingDigests).containsExactlyElementsIn(expectedMissingDigests); + verify(mockWorkerInstance, atMost(3)) + .findMissingBlobs(anyIterable(), any(RequestMetadata.class)); + verify(mockWorkerInstance, atLeast(1)) + .findMissingBlobs(anyIterable(), any(RequestMetadata.class)); for (Digest digest : actualMissingDigests) { assertThat(digest).isNotIn(availableDigests); diff --git a/src/test/java/build/buildfarm/metrics/BUILD b/src/test/java/build/buildfarm/metrics/BUILD index 8e5d3cace9..c32955e20d 100644 --- a/src/test/java/build/buildfarm/metrics/BUILD +++ b/src/test/java/build/buildfarm/metrics/BUILD @@ -8,14 +8,10 @@ java_test( "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/metrics", - "//src/main/java/build/buildfarm/metrics/aws", "//src/main/java/build/buildfarm/metrics/log", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "//src/test/java/build/buildfarm:test_runner", "@googleapis//:google_rpc_error_details_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_sns", "@maven//:com_google_guava_guava", "@maven//:com_google_jimfs_jimfs", "@maven//:com_google_protobuf_protobuf_java", diff --git a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java index 21c9685efd..f30adb55c6 100644 --- a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java +++ b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java @@ -22,7 +22,6 @@ import build.bazel.remote.execution.v2.RequestMetadata; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Metrics; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import build.buildfarm.v1test.OperationRequestMetadata; import com.google.longrunning.Operation; @@ -69,7 +68,7 @@ public class MetricsPublisherTest { public void setUp() throws IOException { configs.getServer().setCloudRegion("test"); configs.getServer().setClusterId("buildfarm-test"); - configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.AWS); + configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.LOG); } @Test @@ -81,7 +80,7 @@ public void publishCompleteMetricsTest() throws InvalidProtocolBufferException { .setMetadata(Any.pack(defaultExecuteOperationMetadata)) .build(); - AwsMetricsPublisher metricsPublisher = new AwsMetricsPublisher(); + LogMetricsPublisher metricsPublisher = new LogMetricsPublisher(); assertThat( AbstractMetricsPublisher.formatRequestMetadataToJson( metricsPublisher.populateRequestMetadata(operation, defaultRequestMetadata))) @@ -110,7 +109,7 @@ public void publishMetricsWithNoExecuteResponseTest() { Operation operation = defaultOperation.toBuilder().setMetadata(Any.pack(defaultExecuteOperationMetadata)).build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } @@ -119,7 +118,7 @@ public void publishMetricsWithNoExecuteOperationMetadataTest() { Operation operation = defaultOperation.toBuilder().setResponse(Any.pack(defaultExecuteResponse)).build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } @@ -135,7 +134,7 @@ public void preconditionFailureTest() { .setMetadata(Any.pack(defaultExecuteOperationMetadata)) .build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } diff --git a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java index bc7acd00a4..47f25cdf79 100644 --- a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java +++ b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java @@ -19,8 +19,10 @@ import build.bazel.remote.execution.v2.Platform; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.QueueEntry; +import build.buildfarm.worker.resources.LocalResourceSet; import com.google.common.collect.HashMultimap; import com.google.common.collect.SetMultimap; +import java.util.concurrent.Semaphore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -52,10 +54,12 @@ public class DequeueMatchEvaluatorTest { public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception { // ARRANGE SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); QueueEntry entry = QueueEntry.newBuilder().setPlatform(Platform.newBuilder()).build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); @@ -70,6 +74,7 @@ public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception { public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception { // ARRANGE SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "11"); QueueEntry entry = @@ -81,7 +86,8 @@ public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception { .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker accepts because it has more cores than the min-cores requested @@ -98,6 +104,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { // ARRANGE configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false); SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "10"); QueueEntry entry = @@ -109,7 +116,8 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker rejects because it has less cores than the min-cores requested @@ -123,6 +131,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Exception { // ARRANGE SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "10"); QueueEntry entry = @@ -136,7 +145,8 @@ public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Excepti .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker accepts because it has the same cores as the min-cores requested @@ -153,6 +163,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false); configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(false); SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); QueueEntry entry = QueueEntry.newBuilder() @@ -163,7 +174,8 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isFalse(); @@ -172,7 +184,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); // ACT - shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); @@ -181,9 +193,151 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); // ACT - shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resource should be claimed + // Failure explanation: semaphore claim did not work as expected. + @Test + public void shouldKeepOperationClaimsResource() throws Exception { + // ARRANGE + configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); + configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); + SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); + resourceSet.resources.put("FOO", new Semaphore(1)); + + QueueEntry entry = + QueueEntry.newBuilder() + .setPlatform( + Platform.newBuilder() + .addProperties( + Platform.Property.newBuilder().setName("resource:FOO").setValue("1"))) + .build(); + + // PRE-ASSERT + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1); + + // ACT + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker rejects because there are no resources left. + assertThat(shouldKeep).isFalse(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resources should be claimed + // Failure explanation: semaphore claim did not work as expected. + @Test + public void shouldKeepOperationClaimsMultipleResource() throws Exception { + // ARRANGE + configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); + configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); + SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); + resourceSet.resources.put("FOO", new Semaphore(2)); + resourceSet.resources.put("BAR", new Semaphore(4)); + + QueueEntry entry = + QueueEntry.newBuilder() + .setPlatform( + Platform.newBuilder() + .addProperties( + Platform.Property.newBuilder().setName("resource:FOO").setValue("1")) + .addProperties( + Platform.Property.newBuilder().setName("resource:BAR").setValue("2"))) + .build(); + + // PRE-ASSERT + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(2); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(4); + + // ACT + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(2); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker rejects because there are no resources left. + assertThat(shouldKeep).isFalse(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0); + } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resources should fail to claim, and the existing amount should be + // the same. + // Failure explanation: semaphore claim did not work as expected. + @Test + public void shouldKeepOperationFailsToClaimSameAmountRemains() throws Exception { + // ARRANGE + configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); + configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); + SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); + resourceSet.resources.put("FOO", new Semaphore(50)); + resourceSet.resources.put("BAR", new Semaphore(100)); + resourceSet.resources.put("BAZ", new Semaphore(200)); + + QueueEntry entry = + QueueEntry.newBuilder() + .setPlatform( + Platform.newBuilder() + .addProperties( + Platform.Property.newBuilder().setName("resource:FOO").setValue("20")) + .addProperties( + Platform.Property.newBuilder().setName("resource:BAR").setValue("101")) + .addProperties( + Platform.Property.newBuilder().setName("resource:BAZ").setValue("20"))) + .build(); + + // PRE-ASSERT + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100); + assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200); + + // ACT + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker rejects because there are no resources left. + // The same amount are returned. + assertThat(shouldKeep).isFalse(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100); + assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200); + } } diff --git a/src/test/java/build/buildfarm/worker/PipelineTest.java b/src/test/java/build/buildfarm/worker/PipelineTest.java index 713f1b0345..f414b8cefe 100644 --- a/src/test/java/build/buildfarm/worker/PipelineTest.java +++ b/src/test/java/build/buildfarm/worker/PipelineTest.java @@ -53,7 +53,7 @@ public void stageThreadReturnCompletesJoin() throws InterruptedException { public void run() {} }, 1); - pipeline.start(null); + pipeline.start(); pipeline.join(); } @@ -68,7 +68,7 @@ public void run() { } }, 1); - pipeline.start(null); + pipeline.start(); pipeline.join(); } } diff --git a/src/test/java/build/buildfarm/worker/StubWorkerContext.java b/src/test/java/build/buildfarm/worker/StubWorkerContext.java index d288d67a6f..7ccf1182d3 100644 --- a/src/test/java/build/buildfarm/worker/StubWorkerContext.java +++ b/src/test/java/build/buildfarm/worker/StubWorkerContext.java @@ -222,4 +222,9 @@ public ResourceLimits commandExecutionSettings(Command command) { public boolean shouldErrorOperationOnRemainingResources() { throw new UnsupportedOperationException(); } + + @Override + public void returnLocalResources(QueueEntry queueEntry) { + throw new UnsupportedOperationException(); + } } diff --git a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java index 60cb0d8be9..3716513102 100644 --- a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java +++ b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java @@ -72,6 +72,11 @@ protected int claimsRequired(OperationContext operationContext) { boolean isFull() { return claims.size() == width; } + + @Override + public int getSlotUsage() { + return 0; + } } @Test diff --git a/src/test/java/build/buildfarm/worker/shard/BUILD b/src/test/java/build/buildfarm/worker/shard/BUILD index b8a7b31ec6..c254e41fa4 100644 --- a/src/test/java/build/buildfarm/worker/shard/BUILD +++ b/src/test/java/build/buildfarm/worker/shard/BUILD @@ -11,6 +11,7 @@ java_test( "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/worker", + "//src/main/java/build/buildfarm/worker/resources", "//src/main/java/build/buildfarm/worker/shard", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "//src/test/java/build/buildfarm:test_runner", diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java index dd584c49af..c24e681abb 100644 --- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java +++ b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java @@ -35,6 +35,7 @@ import build.buildfarm.instance.MatchListener; import build.buildfarm.v1test.QueueEntry; import build.buildfarm.worker.WorkerContext; +import build.buildfarm.worker.resources.LocalResourceSet; import com.google.common.collect.ImmutableList; import com.google.protobuf.Duration; import java.util.ArrayList; @@ -105,6 +106,7 @@ WorkerContext createTestContext(Iterable policies) { /* onlyMulticoreTests=*/ false, /* allowBringYourOwnContainer=*/ false, /* errorOperationRemainingResources=*/ false, + /* resourceSet=*/ new LocalResourceSet(), writer); } diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java similarity index 97% rename from src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java rename to src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java index 7a4e40be26..3df73187bb 100644 --- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java +++ b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java @@ -50,19 +50,19 @@ import org.mockito.MockitoAnnotations; @RunWith(JUnit4.class) -public class ShardWorkerInstanceTest { +public class WorkerInstanceTest { private final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); @Mock private Backplane backplane; @Mock private ContentAddressableStorage storage; - private ShardWorkerInstance instance; + private WorkerInstance instance; @Before public void setUp() throws Exception { MockitoAnnotations.initMocks(this); - instance = new ShardWorkerInstance("test", DIGEST_UTIL, backplane, storage); + instance = new WorkerInstance("test", DIGEST_UTIL, backplane, storage); } @SuppressWarnings("unchecked")