From ec4b82cd488e6a01c3b235210a2fd4090d701105 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 26 Sep 2023 13:21:25 -0400 Subject: [PATCH 01/58] Manipulate worker set directly in RSB Avoid dependency on subscriber to update state changes when removing workers. This prevents an NPE that will invariably occur when workers are legitimately configured with subscribeToBackplane: false. --- .../build/buildfarm/instance/shard/RedisShardBackplane.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index 200929ecbb..6676accdf3 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -634,7 +634,7 @@ public boolean removeWorker(String name, String reason) throws IOException { .setRemove(WorkerChange.Remove.newBuilder().setSource(source).setReason(reason).build()) .build(); String workerChangeJson = JsonFormat.printer().print(workerChange); - return subscriber.removeWorker(name) + return storageWorkerSet.remove(name) && client.call( jedis -> removeWorkerAndPublish(jedis, name, workerChangeJson, /* storage=*/ true)); } From c48d600e5a2b69dc42b732bd16f556c9e6b4b49a Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 26 Sep 2023 13:24:53 -0400 Subject: [PATCH 02/58] Remove publishTtlMetric option The individual metric controls for ttl are not necessary for either performance or feature support. Use Files' attributes acquisition mechanism for modified time. --- examples/config.yml | 1 - .../build/buildfarm/cas/cfc/CASFileCache.java | 54 +++++++------------ .../build/buildfarm/common/config/Cas.java | 3 -- .../worker/shard/ShardCASFileCache.java | 2 - .../build/buildfarm/worker/shard/Worker.java | 1 - .../buildfarm/cas/cfc/CASFileCacheTest.java | 3 -- 6 files changed, 20 insertions(+), 44 deletions(-) diff --git a/examples/config.yml b/examples/config.yml index dd165a4d46..6ea0ee1132 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -112,7 +112,6 @@ worker: hexBucketLevels: 0 execRootCopyFallback: false target: - publishTtlMetric: false executeStageWidth: 1 inputFetchStageWidth: 1 inputFetchDeadline: 60 diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java index 3897c7268f..6b6e63b4ad 100644 --- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java +++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java @@ -145,7 +145,19 @@ public abstract class CASFileCache implements ContentAddressableStorage { Gauge.build().name("cas_size").help("CAS size.").register(); private static final Gauge casEntryCountMetric = Gauge.build().name("cas_entry_count").help("Number of entries in the CAS.").register(); - private static Histogram casTtl; + private static Histogram casTtl = + Histogram.build() + .name("cas_ttl_s") + .buckets( + 3600, // 1 hour + 21600, // 6 hours + 86400, // 1 day + 345600, // 4 days + 604800, // 1 week + 1210000 // 2 weeks + ) + .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") + .register(); private static final Gauge casCopyFallbackMetric = Gauge.build() @@ -160,7 +172,6 @@ public abstract class CASFileCache implements ContentAddressableStorage { private final EntryPathStrategy entryPathStrategy; private final long maxSizeInBytes; private final long maxEntrySizeInBytes; - private final boolean
publishTtlMetric; private final boolean execRootFallback; private final DigestUtil digestUtil; private final ConcurrentMap storage; @@ -306,7 +317,6 @@ public CASFileCache( maxEntrySizeInBytes, config.getHexBucketLevels(), config.isFileDirectoriesIndexInMemory(), - config.isPublishTtlMetric(), config.isExecRootCopyFallback(), digestUtil, expireService, @@ -325,7 +335,6 @@ public CASFileCache( long maxEntrySizeInBytes, int hexBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -339,7 +348,6 @@ public CASFileCache( this.root = root; this.maxSizeInBytes = maxSizeInBytes; this.maxEntrySizeInBytes = maxEntrySizeInBytes; - this.publishTtlMetric = publishTtlMetric; this.execRootFallback = execRootFallback; this.digestUtil = digestUtil; this.expireService = expireService; @@ -351,22 +359,6 @@ public CASFileCache( this.delegateSkipLoad = delegateSkipLoad; this.directoriesIndexDbName = directoriesIndexDbName; - if (publishTtlMetric) { - casTtl = - Histogram.build() - .name("cas_ttl_s") - .buckets( - 3600, // 1 hour - 21600, // 6 hours - 86400, // 1 day - 345600, // 4 days - 604800, // 1 week - 1210000 // 2 weeks - ) - .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") - .register(); - } - entryPathStrategy = new HexBucketEntryPathStrategy(root, hexBucketLevels); String directoriesIndexUrl = "jdbc:sqlite:"; @@ -2710,25 +2702,19 @@ private CancellableOutputStream putOrReference( } private void deleteExpiredKey(String key) throws IOException { - // We don't want publishing the metric to delay the deletion of the file. - // We publish the metric only after the file has been deleted. - long createdTime = 0; Path path = getRemovingPath(key); - if (publishTtlMetric) { - createdTime = path.toFile().lastModified(); - } + long createdTimeMs = Files.getLastModifiedTime(path).to(MILLISECONDS); Files.delete(path); - if (publishTtlMetric) { - publishExpirationMetric(createdTime); - } + publishExpirationMetric(createdTimeMs); } - private void publishExpirationMetric(long createdTime) { - long currentTime = new Date().getTime(); - long ttl = currentTime - createdTime; - casTtl.observe(Time.millisecondsToSeconds(ttl)); + private void publishExpirationMetric(long createdTimeMs) { + // TODO introduce ttl clock + long currentTimeMs = new Date().getTime(); + long ttlMs = currentTimeMs - createdTimeMs; + casTtl.observe(Time.millisecondsToSeconds(ttlMs)); } @SuppressWarnings({"ConstantConditions", "ResultOfMethodCallIgnored"}) diff --git a/src/main/java/build/buildfarm/common/config/Cas.java b/src/main/java/build/buildfarm/common/config/Cas.java index 5b16fb2168..93ecdbfaf2 100644 --- a/src/main/java/build/buildfarm/common/config/Cas.java +++ b/src/main/java/build/buildfarm/common/config/Cas.java @@ -30,9 +30,6 @@ public enum TYPE { private String target; private boolean readonly = false; - // Metrics - private boolean publishTtlMetric = false; - public Path getValidPath(Path root) throws ConfigurationException { if (Strings.isNullOrEmpty(path)) { throw new ConfigurationException("Cas cache directory value in config missing"); diff --git a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java index c1a56beb06..b03cbe6f32 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java @@ -38,7 +38,6 @@ class 
ShardCASFileCache extends CASFileCache { long maxEntrySizeInBytes, int maxBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -53,7 +52,6 @@ class ShardCASFileCache extends CASFileCache { maxEntrySizeInBytes, maxBucketLevels, storeFileDirsIndexInMemory, - publishTtlMetric, execRootFallback, digestUtil, expireService, diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index a33dbaad60..46f0f083f6 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -354,7 +354,6 @@ private ContentAddressableStorage createStorage( // delegate level cas.getHexBucketLevels(), cas.isFileDirectoriesIndexInMemory(), - cas.isPublishTtlMetric(), cas.isExecRootCopyFallback(), digestUtil, removeDirectoryService, diff --git a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java index 1bd1791999..72e167b4df 100644 --- a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java @@ -147,7 +147,6 @@ public void setUp() throws IOException, InterruptedException { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1111,7 +1110,6 @@ public void copyExternalInputRetries() throws Exception { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1175,7 +1173,6 @@ public void newInputThrowsNoSuchFileExceptionWithoutDelegate() throws Exception /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, From 2abad58270b4f89a23a1dacb809cf6ce85d85d43 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Wed, 4 Oct 2023 16:17:38 -0400 Subject: [PATCH 03/58] Config-compatible behavior for publishTtlMetric --- src/main/java/build/buildfarm/common/config/Cas.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/build/buildfarm/common/config/Cas.java b/src/main/java/build/buildfarm/common/config/Cas.java index 93ecdbfaf2..9c671d4958 100644 --- a/src/main/java/build/buildfarm/common/config/Cas.java +++ b/src/main/java/build/buildfarm/common/config/Cas.java @@ -3,7 +3,9 @@ import com.google.common.base.Strings; import java.nio.file.Path; import javax.naming.ConfigurationException; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Cas { @@ -30,6 +32,9 @@ public enum TYPE { private String target; private boolean readonly = false; + @Getter(AccessLevel.NONE) + private boolean publishTtlMetric = false; // deprecated + public Path getValidPath(Path root) throws ConfigurationException { if (Strings.isNullOrEmpty(path)) { throw new ConfigurationException("Cas cache directory value in config missing"); From 10189d628c650fea73484a4470038c3a38b64ba6 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 26 Sep 2023 14:21:10 -0400 Subject: [PATCH 04/58] Correct logging advisements for current Java Java logging definitions must now match java.util.logging.config.file; update these specifications in our README.md. --- README.md | 8 ++++---- 1
file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8dc99874b3..675fe94dbd 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Run via ``` bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- -Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file: **`configfile`** has to be in [yaml format](https://bazelbuild.github.io/bazel-buildfarm/docs/configuration). @@ -43,7 +43,7 @@ Run via ``` bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- -Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file: @@ -68,13 +68,13 @@ You can use typical Java logging configuration to filter these results and obser An example `logging.properties` file has been provided at [examples/logging.properties](examples/logging.properties) for use as follows: ``` -bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` and ``` -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` To attach a remote debugger, run the executable with the `--debug=` flag. 
For example: From 0c30cb057e7a9bfb3850c68608c0085fd488ad24 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 14:44:14 -0400 Subject: [PATCH 05/58] Rename GracefulShutdownTest --- src/main/java/build/buildfarm/tools/BUILD | 6 +++--- .../{GracefulShutdownTest.java => GracefulShutdown.java} | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) rename src/main/java/build/buildfarm/tools/{GracefulShutdownTest.java => GracefulShutdown.java} (93%) diff --git a/src/main/java/build/buildfarm/tools/BUILD b/src/main/java/build/buildfarm/tools/BUILD index 37c6bdb58a..44a4498ef9 100644 --- a/src/main/java/build/buildfarm/tools/BUILD +++ b/src/main/java/build/buildfarm/tools/BUILD @@ -185,9 +185,9 @@ java_binary( ) java_binary( - name = "GracefulShutdownTest", - srcs = ["GracefulShutdownTest.java"], - main_class = "build.buildfarm.tools.GracefulShutdownTest", + name = "GracefulShutdown", + srcs = ["GracefulShutdown.java"], + main_class = "build.buildfarm.tools.GracefulShutdown", visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common/grpc", diff --git a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java b/src/main/java/build/buildfarm/tools/GracefulShutdown.java similarity index 93% rename from src/main/java/build/buildfarm/tools/GracefulShutdownTest.java rename to src/main/java/build/buildfarm/tools/GracefulShutdown.java index 85ae5ab78d..689edfeff1 100644 --- a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java +++ b/src/main/java/build/buildfarm/tools/GracefulShutdown.java @@ -23,9 +23,9 @@ import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.ManagedChannel; -class GracefulShutdownTest { +class GracefulShutdown { /** - * Example command: GracefulShutdownTest ShutDown workerIp buildfarm-endpoint + * Example command: GracefulShutdown ShutDown workerIp buildfarm-endpoint * * @param args */ @@ -48,7 +48,7 @@ private static void shutDownGracefully(String[] args) { } /** - * Example command: GracefulShutdownTest PrepareWorker WorkerIp:port + * Example command: GracefulShutdown PrepareWorker WorkerIp:port * * @param args */ @@ -65,7 +65,7 @@ private static void prepareWorkerForShutDown(String[] args) { } /** - * Example command: GracefulShutdownTest DisableProtection WorkerIp buildfarm_endpoint + * Example command: GracefulShutdown DisableProtection WorkerIp buildfarm_endpoint * * @param args */ From 5a0d8d606ba62d6a801247d9bc52a84761d95a03 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 14:52:16 -0400 Subject: [PATCH 06/58] Remove WebController --- .../common/config/BuildfarmConfigs.java | 1 - .../build/buildfarm/common/config/WebUI.java | 40 --- src/main/java/build/buildfarm/server/BUILD | 3 - .../buildfarm/server/BuildFarmServer.java | 4 - .../build/buildfarm/server/controllers/BUILD | 46 --- .../server/controllers/WebController.java | 291 ------------------ 6 files changed, 385 deletions(-) delete mode 100644 src/main/java/build/buildfarm/common/config/WebUI.java delete mode 100644 src/main/java/build/buildfarm/server/controllers/BUILD delete mode 100644 src/main/java/build/buildfarm/server/controllers/WebController.java diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 68106d56c7..1a15e40803 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -39,7 +39,6 @@ public final class BuildfarmConfigs 
{ private Server server = new Server(); private Backplane backplane = new Backplane(); private Worker worker = new Worker(); - private WebUI ui = new WebUI(); private ExecutionWrappers executionWrappers = new ExecutionWrappers(); private BuildfarmConfigs() {} diff --git a/src/main/java/build/buildfarm/common/config/WebUI.java b/src/main/java/build/buildfarm/common/config/WebUI.java deleted file mode 100644 index 443d210691..0000000000 --- a/src/main/java/build/buildfarm/common/config/WebUI.java +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.common.config; - -import lombok.Data; - -/** - * @class WebUI - * @brief Settings for buildfarm's web UI. - * @details Buildfarm provides a web frontend for developers to introspect builds. - */ -@Data -public class WebUI { - /** - * @field enable - * @brief Whether to enable the web frontend. - * @details When disabled there will be no ports opened or routes available. - */ - public boolean enable = false; - - /** - * @field port - * @brief HTTP port for the web frontend. - * @details 8080 is useful for local testing since port 80 requires sudo. We choose the following - * default since many ports are blocked in upstream CI. 
- */ - public String port = "8982"; -} diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index b05d2e5ef7..0d6d396dc8 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -14,7 +14,6 @@ java_library( "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/shard", "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/server/controllers:WebController", "//src/main/java/build/buildfarm/server/services", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@maven//:com_github_pcj_google_options", @@ -29,10 +28,8 @@ java_library( "@maven//:org_springframework_boot_spring_boot", "@maven//:org_springframework_boot_spring_boot_autoconfigure", "@maven//:org_springframework_boot_spring_boot_starter_thymeleaf", - "@maven//:org_springframework_boot_spring_boot_starter_web", "@maven//:org_springframework_spring_beans", "@maven//:org_springframework_spring_context", "@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", ], ) diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index d0d26a6f52..e82928e934 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -29,7 +29,6 @@ import build.buildfarm.instance.Instance; import build.buildfarm.instance.shard.ShardInstance; import build.buildfarm.metrics.prometheus.PrometheusPublisher; -import build.buildfarm.server.controllers.WebController; import build.buildfarm.server.services.ActionCacheService; import build.buildfarm.server.services.AdminService; import build.buildfarm.server.services.CapabilitiesService; @@ -143,7 +142,6 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName checkState(!stopping, "must not call start after stop"); instance.start(publicName); - WebController.setInstance((ShardInstance) instance); server.start(); healthStatusManager.setStatus( @@ -214,8 +212,6 @@ public static void main(String[] args) throws ConfigurationException { // Disable Logback System.setProperty("org.springframework.boot.logging.LoggingSystem", "none"); - springConfig.put("ui.frontend.enable", configs.getUi().isEnable()); - springConfig.put("server.port", configs.getUi().getPort()); app.setDefaultProperties(springConfig); try { diff --git a/src/main/java/build/buildfarm/server/controllers/BUILD b/src/main/java/build/buildfarm/server/controllers/BUILD deleted file mode 100644 index 1e523bcef5..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/BUILD +++ /dev/null @@ -1,46 +0,0 @@ -java_library( - name = "WebController", - srcs = ["WebController.java"], - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common", - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/common/grpc", - "//src/main/java/build/buildfarm/common/resources", - "//src/main/java/build/buildfarm/common/resources:resource_java_proto", - "//src/main/java/build/buildfarm/common/services", - "//src/main/java/build/buildfarm/instance", - "//src/main/java/build/buildfarm/instance/shard", - "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/operations", - "//src/main/java/build/buildfarm/server/services", - 
"//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_error_details_proto", - "@maven//:com_github_pcj_google_options", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:com_googlecode_json_simple_json_simple", - "@maven//:com_jayway_jsonpath_json_path", - "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_protobuf", - "@maven//:io_grpc_grpc_services", - "@maven//:io_prometheus_simpleclient", - "@maven//:javax_annotation_javax_annotation_api", - "@maven//:org_bouncycastle_bcprov_jdk15on", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_web", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/server/controllers/WebController.java b/src/main/java/build/buildfarm/server/controllers/WebController.java deleted file mode 100644 index 5ed5c8250e..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/WebController.java +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.server.controllers; - -import build.bazel.remote.execution.v2.ExecuteOperationMetadata; -import build.bazel.remote.execution.v2.ExecuteResponse; -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.instance.shard.ShardInstance; -import build.buildfarm.operations.EnrichedOperation; -import build.buildfarm.v1test.CompletedOperationMetadata; -import build.buildfarm.v1test.ExecutingOperationMetadata; -import build.buildfarm.v1test.QueuedOperationMetadata; -import com.google.longrunning.Operation; -import com.google.protobuf.Any; -import com.google.protobuf.Duration; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.Timestamp; -import com.google.protobuf.util.Durations; -import com.google.protobuf.util.JsonFormat; -import com.google.protobuf.util.Timestamps; -import com.google.rpc.PreconditionFailure; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.stereotype.Controller; -import org.springframework.ui.Model; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; - -// An HTTP frontend for providing a web UI in buildfarm. Its primarily used by developers so that -// they can introspect their build inovocations. Standalone tools that communicate over GRPC are -// sometimes less accessible. Nonetheless, the APIs should be similar. -// The web frontend can be deployed as part of the servers. However, the controller may also be -// included in other standalone tools if you'd like the frontend decoupled from the servers. -// This controller can provide web content directly through spring boot via thymeleaf. -// The controller also provides raw REST APIs for those wishing to build out their own frontend. -@Log -@Controller -@ConditionalOnProperty("ui.frontend.enable") -public class WebController { - private static ShardInstance instance; - - // Most of the routes require an instance to have their queries fulfilled. - // I wanted to have the instance set in the controller's constructor, but I was having trouble - // doing that with springboot's bean initialization and auto-wiring. - // Particularly because the controller is constructed before the instance is actually created. - // I settled for having the instance static and allowing the server startup to provide the - // instance once ready. - public static void setInstance(ShardInstance instanceIn) { - instance = instanceIn; - } - - // This is typically the starting for a developer looking to introspect their builds. - // When running a bazel client with `--bes_results_url` they will shown this URL route. - // The page is intented to give them a summary of their build invocation and allow them to drill - // down into more details. - @GetMapping("/invocation/{invocationId}") - public String invocation(Model model, @PathVariable String invocationId) { - // We need to look up the user's operations as fast as possible. Scanning all of the stored - // operations and filtering by invocatio ID (i.e. O(n)) does not scale. Instead, the invocation - // ID must be the primary key to their specific list of build operation IDs. We then lookup - // each - // operation by ID (preferably batched). This is technically two backend calls. 
It could be - // made faster - // at the expense of duplicate information stored in the backend. - Set operationIDs = instance.findOperationsByInvocationId(invocationId); - Iterable> foundOperations = instance.getOperations(operationIDs); - - // Populate the model for the page. - buildInvocationModel(model, invocationId, foundOperations); - - // Render page. - return "invocation"; - } - - private void buildInvocationModel( - Model model, String invocationId, Iterable> foundOperations) { - // Create an array representing table information about all the oprations involved in the - // invocation. - // This is the core content of the page. - JSONArray operationResults = new JSONArray(); - for (Map.Entry entry : foundOperations) { - Operation operation = jsonToOperation(entry.getValue()); - JSONObject obj = new JSONObject(); - obj.put("target", extractTargetId(operation)); - obj.put("mnemonic", extractActionMnemonic(operation)); - obj.put("stage", extractStatus(operation)); - obj.put("duration", extractDuration(operation)); - obj.put("worker", extractWorker(operation)); - - String operationId = entry.getKey(); - String id = operationId.substring(operationId.lastIndexOf('/')).substring(1); - obj.put("operationId", id); - - operationResults.add(obj); - } - - // Populate data to be provided to the frontend. - model.addAttribute("operation", operationResults.toJSONString()); - model.addAttribute("invocationId", String.format("Invocation: %s", invocationId)); - } - - // An operation represents an executed action. - // The operation has a target ID which corresponds to the bazel label what developers are - // typically thinking of when wishing to evaluate their build. - // This page shows them all of the information that we track related to the operation. - @GetMapping("/operation/{operationId}") - public String operation(Model model, @PathVariable String operationId) { - EnrichedOperation result = - instance.findEnrichedOperation(String.format("shard/operations/%s", operationId)); - model.addAttribute("fullOperation", result.asJsonString()); - return "operation"; - } - - // Information about the current deployment. Useful for verifying what is running. - @GetMapping("/info") - public String info(Model model) { - return "info"; - } - - /** - * @brief Convert string json into operation type. - * @details Parses json and returns null if invalid. - * @param json The json to convert to Operation type. - * @return The created operation. - * @note Suggested return identifier: operation. 
- */ - private static Operation jsonToOperation(String json) { - // create a json parser - JsonFormat.Parser operationParser = - JsonFormat.parser() - .usingTypeRegistry( - JsonFormat.TypeRegistry.newBuilder() - .add(CompletedOperationMetadata.getDescriptor()) - .add(ExecutingOperationMetadata.getDescriptor()) - .add(ExecuteOperationMetadata.getDescriptor()) - .add(QueuedOperationMetadata.getDescriptor()) - .add(PreconditionFailure.getDescriptor()) - .build()) - .ignoringUnknownFields(); - - if (json == null) { - log.log(Level.WARNING, "Operation Json is empty"); - return null; - } - try { - Operation.Builder operationBuilder = Operation.newBuilder(); - operationParser.merge(json, operationBuilder); - return operationBuilder.build(); - } catch (InvalidProtocolBufferException e) { - log.log(Level.WARNING, "InvalidProtocolBufferException while building an operation.", e); - return null; - } - } - - private String extractTargetId(Operation operation) { - return expectRequestMetadata(operation).getTargetId(); - } - - private String extractActionMnemonic(Operation operation) { - return expectRequestMetadata(operation).getActionMnemonic(); - } - - private String extractStatus(Operation operation) { - return String.valueOf(expectExecuteOperationMetadata(operation).getStage()); - } - - private String extractDuration(Operation operation) { - Any result = operation.getResponse(); - try { - Timestamp start = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerStartTimestamp(); - Timestamp end = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerCompletedTimestamp(); - Duration duration = Timestamps.between(start, end); - return Durations.toSecondsAsDouble(duration) + "s"; - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private String extractWorker(Operation operation) { - Any result = operation.getResponse(); - try { - return result.unpack(ExecuteResponse.class).getResult().getExecutionMetadata().getWorker(); - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private static ExecuteOperationMetadata expectExecuteOperationMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getExecuteOperationMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); - if (executingOperationMetadata != null) { - return executingOperationMetadata.getExecuteOperationMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getExecuteOperationMetadata(); - } - return ExecuteOperationMetadata.getDefaultInstance(); - } - - private static RequestMetadata expectRequestMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getRequestMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); 
- if (executingOperationMetadata != null) { - return executingOperationMetadata.getRequestMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getRequestMetadata(); - } - return RequestMetadata.getDefaultInstance(); - } - - private static QueuedOperationMetadata maybeQueuedOperationMetadata(String name, Any metadata) { - if (metadata.is(QueuedOperationMetadata.class)) { - try { - return metadata.unpack(QueuedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static ExecutingOperationMetadata maybeExecutingOperationMetadata( - String name, Any metadata) { - if (metadata.is(ExecutingOperationMetadata.class)) { - try { - return metadata.unpack(ExecutingOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static CompletedOperationMetadata maybeCompletedOperationMetadata( - String name, Any metadata) { - if (metadata.is(CompletedOperationMetadata.class)) { - try { - return metadata.unpack(CompletedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid completed operation metadata %s", name), e); - } - } - return null; - } -} From fa8dd6fa1c4d138e08333f07c048446aa15a9b33 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 14:54:06 -0400 Subject: [PATCH 07/58] Interrupt+Join operationQueuer/dispatchMonitor Use interrupt to halt the OperationQueuer. Join on both operationQueuer and dispatchMonitor before instance stop return. --- .../java/build/buildfarm/instance/shard/ShardInstance.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java index 883931d2d4..bf0c42a8c5 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java @@ -568,10 +568,12 @@ public void stop() throws InterruptedException { stopping = true; log.log(Level.FINER, format("Instance %s is stopping", getName())); if (operationQueuer != null) { - operationQueuer.stop(); + operationQueuer.interrupt(); + operationQueuer.join(); } if (dispatchedMonitor != null) { dispatchedMonitor.interrupt(); + dispatchedMonitor.join(); } if (prometheusMetricsThread != null) { prometheusMetricsThread.interrupt(); From 60093dbe25506302cf3f2c115f98378dc96629b7 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 15:33:45 -0400 Subject: [PATCH 08/58] Present operationNames by stage Include Match and Report Result stages in output Record the active operationName occupying slots in each of the stages and present them with WorkerProfile Avoid several unnecessary casts with interfaces for operation slot stages. 
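As a minimal sketch of the bookkeeping this introduces (class and member names here are illustrative only; the diff below carries the actual SuperscalarPipelineStage implementation), each superscalar stage guards a set of active operation names and hands out a snapshot for profiling:

```java
// Illustrative sketch only: per-slot operation tracking in a superscalar stage.
import java.util.HashSet;
import java.util.Set;

abstract class TrackedStage {
  // guarded by synchronized (operationNames)
  private final Set<String> operationNames = new HashSet<>();

  protected void start(String operationName) {
    synchronized (operationNames) {
      operationNames.add(operationName); // slot occupied
    }
  }

  protected void complete(String operationName) {
    synchronized (operationNames) {
      operationNames.remove(operationName); // slot released
    }
  }

  // copy under the lock so WorkerProfile iterates a consistent snapshot
  Iterable<String> getOperationNames() {
    synchronized (operationNames) {
      return new HashSet<>(operationNames);
    }
  }
}
```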
--- src/main/java/build/buildfarm/tools/Cat.java | 3 + .../buildfarm/worker/ExecuteActionStage.java | 4 +- .../buildfarm/worker/InputFetchStage.java | 6 +- .../build/buildfarm/worker/MatchStage.java | 6 +- .../build/buildfarm/worker/PipelineStage.java | 35 ++++-- .../worker/SuperscalarPipelineStage.java | 39 ++++++- .../build/buildfarm/worker/shard/Worker.java | 21 ++-- .../worker/shard/WorkerProfileService.java | 100 +++++++++++------- .../build/buildfarm/v1test/buildfarm.proto | 2 + .../worker/SuperscalarPipelineStageTest.java | 5 + 10 files changed, 155 insertions(+), 66 deletions(-) diff --git a/src/main/java/build/buildfarm/tools/Cat.java b/src/main/java/build/buildfarm/tools/Cat.java index 7c8e561454..bca404b3c1 100644 --- a/src/main/java/build/buildfarm/tools/Cat.java +++ b/src/main/java/build/buildfarm/tools/Cat.java @@ -650,6 +650,9 @@ private static void getWorkerProfile(Instance instance) { private static void printStageInformation(StageInformation stage) { System.out.printf("%s slots configured: %d%n", stage.getName(), stage.getSlotsConfigured()); System.out.printf("%s slots used %d%n", stage.getName(), stage.getSlotsUsed()); + for (String operationName : stage.getOperationNamesList()) { + System.out.printf("%s operation %s\n", stage.getName(), operationName); + } } private static void printOperationTime(OperationTimesBetweenStages time) { diff --git a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java index 6c5e247a8f..f1ad546676 100644 --- a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java +++ b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java @@ -107,7 +107,7 @@ public void releaseExecutor( int slotUsage = removeAndRelease(operationName, claims); executionTime.observe(usecs / 1000.0); executionStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, @@ -141,7 +141,7 @@ protected void iterate() throws InterruptedException { executors.add(executorThread); int slotUsage = executorClaims.addAndGet(limits.cpu.claimed); executionSlotUsage.set(slotUsage); - logStart(operationContext.operation.getName(), getUsage(slotUsage)); + start(operationContext.operation.getName(), getUsage(slotUsage)); executorThread.start(); } } diff --git a/src/main/java/build/buildfarm/worker/InputFetchStage.java b/src/main/java/build/buildfarm/worker/InputFetchStage.java index 6b0f741fd8..3953ef9595 100644 --- a/src/main/java/build/buildfarm/worker/InputFetchStage.java +++ b/src/main/java/build/buildfarm/worker/InputFetchStage.java @@ -72,13 +72,14 @@ public void releaseInputFetcher( int size = removeAndRelease(operationName); inputFetchTime.observe(usecs / 1000.0); inputFetchStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, String.format("%s, %s", success ? 
"Success" : "Failure", getUsage(size))); } + @Override public int getSlotUsage() { return fetchers.size(); } @@ -106,8 +107,7 @@ protected void iterate() throws InterruptedException { fetchers.add(fetcher); int slotUsage = fetchers.size(); inputFetchSlotUsage.set(slotUsage); - logStart( - operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); + start(operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); fetcher.start(); } } diff --git a/src/main/java/build/buildfarm/worker/MatchStage.java b/src/main/java/build/buildfarm/worker/MatchStage.java index e245ee68d6..dc81a2ce31 100644 --- a/src/main/java/build/buildfarm/worker/MatchStage.java +++ b/src/main/java/build/buildfarm/worker/MatchStage.java @@ -103,12 +103,12 @@ public void setOnCancelHandler(Runnable onCancelHandler) { @SuppressWarnings("SameReturnValue") private boolean onOperationPolled() throws InterruptedException { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - logStart(operationName); + start(operationName); long matchingAtUSecs = stopwatch.elapsed(MICROSECONDS); OperationContext matchedOperationContext = match(operationContext); long matchedInUSecs = stopwatch.elapsed(MICROSECONDS) - matchingAtUSecs; - logComplete(operationName, matchedInUSecs, waitDuration, true); + complete(operationName, matchedInUSecs, waitDuration, true); matchedOperationContext.poller.pause(); try { output.put(matchedOperationContext); @@ -139,7 +139,7 @@ protected void iterate() throws InterruptedException { } MatchOperationListener listener = new MatchOperationListener(operationContext, stopwatch); try { - logStart(); + start(); workerContext.match(listener); } finally { if (!listener.wasMatched()) { diff --git a/src/main/java/build/buildfarm/worker/PipelineStage.java b/src/main/java/build/buildfarm/worker/PipelineStage.java index 7dd1d6d9ee..edad2bb572 100644 --- a/src/main/java/build/buildfarm/worker/PipelineStage.java +++ b/src/main/java/build/buildfarm/worker/PipelineStage.java @@ -19,6 +19,7 @@ import com.google.common.base.Stopwatch; import java.util.logging.Level; import java.util.logging.Logger; +import javax.annotation.Nullable; public abstract class PipelineStage implements Runnable { protected final String name; @@ -30,6 +31,7 @@ public abstract class PipelineStage implements Runnable { private volatile boolean closed = false; private Thread tickThread = null; private boolean tickCancelledFlag = false; + private String operationName = null; PipelineStage( String name, WorkerContext workerContext, PipelineStage output, PipelineStage error) { @@ -39,12 +41,20 @@ public abstract class PipelineStage implements Runnable { this.error = error; } + public String getName() { + return name; + } + private void runInterruptible() throws InterruptedException { while (!output.isClosed() || isClaimed()) { iterate(); } } + public @Nullable String getOperationName() { + return operationName; + } + @Override public void run() { try { @@ -94,7 +104,7 @@ protected void iterate() throws InterruptedException { Stopwatch stopwatch = Stopwatch.createUnstarted(); try { operationContext = take(); - logStart(operationContext.operation.getName()); + start(operationContext.operation.getName()); stopwatch.start(); boolean valid = false; tickThread = Thread.currentThread(); @@ -128,31 +138,34 @@ protected void iterate() throws InterruptedException { } after(operationContext); long usecs = stopwatch.elapsed(MICROSECONDS); - logComplete( - 
operationContext.operation.getName(), usecs, stallUSecs, nextOperationContext != null); + complete(operationName, usecs, stallUSecs, nextOperationContext != null); + operationName = null; } private String logIterateId(String operationName) { return String.format("%s::iterate(%s)", name, operationName); } - protected void logStart() { - logStart(""); + protected void start() { + start(""); } - protected void logStart(String operationName) { - logStart(operationName, "Starting"); + protected void start(String operationName) { + start(operationName, "Starting"); } - protected void logStart(String operationName, String message) { + protected void start(String operationName, String message) { + // TODO to unary stage + this.operationName = operationName; getLogger().log(Level.FINER, String.format("%s: %s", logIterateId(operationName), message)); } - protected void logComplete(String operationName, long usecs, long stallUSecs, boolean success) { - logComplete(operationName, usecs, stallUSecs, success ? "Success" : "Failed"); + protected void complete(String operationName, long usecs, long stallUSecs, boolean success) { + complete(operationName, usecs, stallUSecs, success ? "Success" : "Failed"); } - protected void logComplete(String operationName, long usecs, long stallUSecs, String status) { + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + this.operationName = operationName; getLogger() .log( Level.FINER, diff --git a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java index 64636d72c2..ed7e610a10 100644 --- a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java +++ b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java @@ -14,17 +14,21 @@ package build.buildfarm.worker; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; import java.util.logging.Level; -abstract class SuperscalarPipelineStage extends PipelineStage { +public abstract class SuperscalarPipelineStage extends PipelineStage { protected final int width; @SuppressWarnings("rawtypes") protected final BlockingQueue claims; + protected Set operationNames = new HashSet<>(); + private volatile boolean catastrophic = false; // ensure that only a single claim waits for available slots for core count @@ -46,6 +50,39 @@ public SuperscalarPipelineStage( protected abstract int claimsRequired(OperationContext operationContext); + @Override + public String getOperationName() { + throw new UnsupportedOperationException("use getOperationNames on superscalar stages"); + } + + public int getWidth() { + return width; + } + + public abstract int getSlotUsage(); + + public Iterable getOperationNames() { + synchronized (operationNames) { + return new HashSet(operationNames); + } + } + + @Override + protected void start(String operationName, String message) { + synchronized (operationNames) { + operationNames.add(operationName); + } + super.start(operationName, message); + } + + @Override + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + super.complete(operationName, usecs, stallUSecs, status); + synchronized (operationNames) { + operationNames.remove(operationName); + } + } + synchronized void waitForReleaseOrCatastrophe(BlockingQueue queue) { boolean interrupted = false; while (!catastrophic && isClaimed()) { diff --git 
a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 46f0f083f6..a5affbbaec 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -57,6 +57,7 @@ import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.ReportResultStage; +import build.buildfarm.worker.SuperscalarPipelineStage; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; import com.google.devtools.common.options.OptionsParsingException; @@ -197,7 +198,7 @@ private Operation stripQueuedOperation(Operation operation) { private Server createServer( ServerBuilder serverBuilder, - ContentAddressableStorage storage, + @Nullable CASFileCache storage, Instance instance, Pipeline pipeline, ShardWorkerContext context) { @@ -211,13 +212,13 @@ private Server createServer( // It will use various execution phases for it's profile service. // On the other hand, a worker that is only capable of CAS storage does not need a pipeline. if (configs.getWorker().getCapabilities().isExecution()) { - PipelineStage completeStage = - new PutOperationStage((operation) -> context.deactivate(operation.getName())); + PutOperationStage completeStage = + new PutOperationStage(operation -> context.deactivate(operation.getName())); PipelineStage errorStage = completeStage; /* new ErrorStage(); */ PipelineStage reportResultStage = new ReportResultStage(context, completeStage, errorStage); - PipelineStage executeActionStage = + SuperscalarPipelineStage executeActionStage = new ExecuteActionStage(context, reportResultStage, errorStage); - PipelineStage inputFetchStage = + SuperscalarPipelineStage inputFetchStage = new InputFetchStage(context, executeActionStage, new PutOperationStage(context::requeue)); PipelineStage matchStage = new MatchStage(context, inputFetchStage, errorStage); @@ -228,7 +229,13 @@ private Server createServer( serverBuilder.addService( new WorkerProfileService( - storage, inputFetchStage, executeActionStage, context, completeStage, backplane)); + storage, + matchStage, + inputFetchStage, + executeActionStage, + reportResultStage, + completeStage, + backplane)); } GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, configs.getWorker().getGrpcMetrics()); serverBuilder.intercept(new ServerHeadersInterceptor()); @@ -608,7 +615,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep writer); pipeline = new Pipeline(); - server = createServer(serverBuilder, storage, instance, pipeline, context); + server = createServer(serverBuilder, (CASFileCache) storage, instance, pipeline, context); removeWorker(configs.getWorker().getPublicName()); diff --git a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java index 6eee990426..f00c57cc3d 100644 --- a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java @@ -15,7 +15,6 @@ package build.buildfarm.worker.shard; import build.buildfarm.backplane.Backplane; -import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.v1test.OperationTimesBetweenStages; import build.buildfarm.v1test.StageInformation; @@ -24,67 +23,90 @@ import build.buildfarm.v1test.WorkerProfileGrpc; import 
build.buildfarm.v1test.WorkerProfileMessage; import build.buildfarm.v1test.WorkerProfileRequest; -import build.buildfarm.worker.ExecuteActionStage; -import build.buildfarm.worker.InputFetchStage; import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.PutOperationStage.OperationStageDurations; -import build.buildfarm.worker.WorkerContext; +import build.buildfarm.worker.SuperscalarPipelineStage; import io.grpc.stub.StreamObserver; import java.io.IOException; +import javax.annotation.Nullable; public class WorkerProfileService extends WorkerProfileGrpc.WorkerProfileImplBase { - private final CASFileCache storage; - private final InputFetchStage inputFetchStage; - private final ExecuteActionStage executeActionStage; - private final WorkerContext context; + private final @Nullable CASFileCache storage; + private final PipelineStage matchStage; + private final SuperscalarPipelineStage inputFetchStage; + private final SuperscalarPipelineStage executeActionStage; + private final PipelineStage reportResultStage; private final PutOperationStage completeStage; private final Backplane backplane; public WorkerProfileService( - ContentAddressableStorage storage, - PipelineStage inputFetchStage, - PipelineStage executeActionStage, - WorkerContext context, - PipelineStage completeStage, + @Nullable CASFileCache storage, + PipelineStage matchStage, + SuperscalarPipelineStage inputFetchStage, + SuperscalarPipelineStage executeActionStage, + PipelineStage reportResultStage, + PutOperationStage completeStage, Backplane backplane) { - this.storage = (CASFileCache) storage; - this.inputFetchStage = (InputFetchStage) inputFetchStage; - this.executeActionStage = (ExecuteActionStage) executeActionStage; - this.context = context; + this.storage = storage; + this.matchStage = matchStage; + this.inputFetchStage = inputFetchStage; + this.executeActionStage = executeActionStage; + this.reportResultStage = reportResultStage; this.completeStage = (PutOperationStage) completeStage; this.backplane = backplane; } + private StageInformation unaryStageInformation(String name, @Nullable String operationName) { + StageInformation.Builder builder = + StageInformation.newBuilder().setName(name).setSlotsConfigured(1); + if (operationName != null) { + builder.setSlotsUsed(1).addOperationNames(operationName); + } + return builder.build(); + } + + private StageInformation superscalarStageInformation(SuperscalarPipelineStage stage) { + return StageInformation.newBuilder() + .setName(stage.getName()) + .setSlotsConfigured(stage.getWidth()) + .setSlotsUsed(stage.getSlotUsage()) + .addAllOperationNames(stage.getOperationNames()) + .build(); + } + @Override public void getWorkerProfile( WorkerProfileRequest request, StreamObserver responseObserver) { // get usage of CASFileCache - WorkerProfileMessage.Builder replyBuilder = - WorkerProfileMessage.newBuilder() - .setCasSize(storage.size()) - .setCasEntryCount(storage.entryCount()) - .setCasMaxSize(storage.maxSize()) - .setCasMaxEntrySize(storage.maxEntrySize()) - .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) - .setCasDirectoryEntryCount(storage.directoryStorageCount()) - .setCasEvictedEntryCount(storage.getEvictedCount()) - .setCasEvictedEntrySize(storage.getEvictedSize()); + WorkerProfileMessage.Builder replyBuilder = WorkerProfileMessage.newBuilder(); + + // FIXME deliver full local storage chain + if (storage != null) { + replyBuilder + .setCasSize(storage.size()) + 
.setCasEntryCount(storage.entryCount()) + .setCasMaxSize(storage.maxSize()) + .setCasMaxEntrySize(storage.maxEntrySize()) + .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) + .setCasDirectoryEntryCount(storage.directoryStorageCount()) + .setCasEvictedEntryCount(storage.getEvictedCount()) + .setCasEvictedEntrySize(storage.getEvictedSize()); + } // get slots configured and used of superscalar stages + // prefer reverse order to avoid double counting if possible + // these stats are not consistent across their sampling and will + // produce: slots that are not consistent with operations, operations + // in multiple stages even in reverse due to claim progress + // in short: this is for monitoring, not for guaranteed consistency checks + String reportResultOperation = reportResultStage.getOperationName(); + String matchOperation = matchStage.getOperationName(); replyBuilder - .addStages( - StageInformation.newBuilder() - .setName("InputFetchStage") - .setSlotsConfigured(context.getInputFetchStageWidth()) - .setSlotsUsed(inputFetchStage.getSlotUsage()) - .build()) - .addStages( - StageInformation.newBuilder() - .setName("ExecuteActionStage") - .setSlotsConfigured(context.getExecuteStageWidth()) - .setSlotsUsed(executeActionStage.getSlotUsage()) - .build()); + .addStages(unaryStageInformation(reportResultStage.getName(), reportResultOperation)) + .addStages(superscalarStageInformation(executeActionStage)) + .addStages(superscalarStageInformation(inputFetchStage)) + .addStages(unaryStageInformation(matchStage.getName(), matchOperation)); // get average time costs on each stage OperationStageDurations[] durations = completeStage.getAverageTimeCostPerStage(); diff --git a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto index b4c78b3c03..9632593f11 100644 --- a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto +++ b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto @@ -602,6 +602,8 @@ message StageInformation { // number of slots used for this stage int32 slots_used = 3; + + repeated string operation_names = 4; } message WorkerProfileMessage { diff --git a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java index 60cb0d8be9..3716513102 100644 --- a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java +++ b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java @@ -72,6 +72,11 @@ protected int claimsRequired(OperationContext operationContext) { boolean isFull() { return claims.size() == width; } + + @Override + public int getSlotUsage() { + return 0; + } } @Test From 9ff8e6448f6ec4c67667b4fbb46c484318354bf4 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 26 Sep 2023 14:36:58 -0400 Subject: [PATCH 09/58] Remove subscribeToBackplane, adjust failsafe op A shard server is impractical without operation subscription; partition subscription confirmation between servers and workers. The failsafe execution is configuration that is likely not desired on workers. This change removes the failsafe behavior from workers via backplane config, and relegates the setting of the failsafe boolean to server config. If the option is restored for workers, it can be added to worker configs so that configs may continue to be shared between workers and servers and retain independent addressability.
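Condensed from the call sites in the diff below, the resulting constructor split looks like this (servers always subscribe and own the failsafe decision; workers opt out of both):

```java
// Server (ShardInstance): always subscribe; failsafe follows server config.
return new RedisShardBackplane(
    identifier,
    /* subscribeToBackplane=*/ true,
    configs.getServer().isRunFailsafeOperation(),
    ShardInstance::stripOperation,
    ShardInstance::stripQueuedOperation);

// Worker: no subscription, no failsafe operation.
backplane =
    new RedisShardBackplane(
        identifier,
        /* subscribeToBackplane=*/ false,
        /* runFailsafeOperation=*/ false,
        this::stripOperation,
        this::stripQueuedOperation);
```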
--- examples/config.yml | 3 +- .../buildfarm/common/config/Backplane.java | 11 ++- .../build/buildfarm/common/config/Server.java | 1 + .../instance/shard/RedisShardBackplane.java | 20 ++++- .../instance/shard/ShardInstance.java | 6 +- .../build/buildfarm/worker/shard/Worker.java | 7 +- .../shard/RedisShardBackplaneTest.java | 83 +++++++++---------- 7 files changed, 79 insertions(+), 52 deletions(-) diff --git a/examples/config.yml b/examples/config.yml index 6ea0ee1132..f426e98547 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -22,6 +22,7 @@ server: dispatchedMonitorIntervalSeconds: 1 runOperationQueuer: true ensureOutputsPresent: false + runFailsafeOperation: true maxCpu: 0 maxRequeueAttempts: 5 useDenyList: true @@ -70,8 +71,6 @@ backplane: operationChannelPrefix: "OperationChannel" casPrefix: "ContentAddressableStorage" casExpire: 604800 # 1 week - subscribeToBackplane: true - runFailsafeOperation: true maxQueueDepth: 100000 maxPreQueueDepth: 1000000 priorityQueue: false diff --git a/src/main/java/build/buildfarm/common/config/Backplane.java b/src/main/java/build/buildfarm/common/config/Backplane.java index 9dd28c3d22..5c21532ab4 100644 --- a/src/main/java/build/buildfarm/common/config/Backplane.java +++ b/src/main/java/build/buildfarm/common/config/Backplane.java @@ -1,7 +1,9 @@ package build.buildfarm.common.config; import com.google.common.base.Strings; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Backplane { @@ -32,8 +34,13 @@ public enum BACKPLANE_TYPE { private String operationChannelPrefix = "OperationChannel"; private String casPrefix = "ContentAddressableStorage"; private int casExpire = 604800; // 1 Week - private boolean subscribeToBackplane = true; - private boolean runFailsafeOperation = true; + + @Getter(AccessLevel.NONE) + private boolean subscribeToBackplane = true; // deprecated + + @Getter(AccessLevel.NONE) + private boolean runFailsafeOperation = true; // deprecated + private int maxQueueDepth = 100000; private int maxPreQueueDepth = 1000000; private boolean priorityQueue = false; diff --git a/src/main/java/build/buildfarm/common/config/Server.java b/src/main/java/build/buildfarm/common/config/Server.java index e169c2575b..9f2899ff60 100644 --- a/src/main/java/build/buildfarm/common/config/Server.java +++ b/src/main/java/build/buildfarm/common/config/Server.java @@ -24,6 +24,7 @@ public enum INSTANCE_TYPE { private String sslPrivateKeyPath = null; private boolean runDispatchedMonitor = true; private int dispatchedMonitorIntervalSeconds = 1; + private boolean runFailsafeOperation = true; private boolean runOperationQueuer = true; private boolean ensureOutputsPresent = false; private int maxRequeueAttempts = 5; diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index 6676accdf3..ebb832f5aa 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -129,6 +129,8 @@ public class RedisShardBackplane implements Backplane { .build()); private final String source; // used in operation change publication + private final boolean subscribeToBackplane; + private final boolean runFailsafeOperation; private final Function onPublish; private final Function onComplete; private final Supplier jedisClusterFactory; @@ -149,18 +151,30 @@ public class RedisShardBackplane implements Backplane { public RedisShardBackplane( 
String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete) throws ConfigurationException { - this(source, onPublish, onComplete, JedisClusterFactory.create(source)); + this( + source, + subscribeToBackplane, + runFailsafeOperation, + onPublish, + onComplete, + JedisClusterFactory.create(source)); } public RedisShardBackplane( String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete, Supplier jedisClusterFactory) { this.source = source; + this.subscribeToBackplane = subscribeToBackplane; + this.runFailsafeOperation = runFailsafeOperation; this.onPublish = onPublish; this.onComplete = onComplete; this.jedisClusterFactory = jedisClusterFactory; @@ -519,10 +533,10 @@ public void start(String clientPublicName) throws IOException { // Create containers that make up the backplane state = DistributedStateCreator.create(client); - if (configs.getBackplane().isSubscribeToBackplane()) { + if (subscribeToBackplane) { startSubscriptionThread(); } - if (configs.getBackplane().isRunFailsafeOperation()) { + if (runFailsafeOperation) { startFailsafeOperationThread(); } diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java index bf0c42a8c5..c1c07b08f0 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java @@ -254,7 +254,11 @@ public class ShardInstance extends AbstractServerInstance { private static Backplane createBackplane(String identifier) throws ConfigurationException { if (configs.getBackplane().getType().equals(SHARD)) { return new RedisShardBackplane( - identifier, ShardInstance::stripOperation, ShardInstance::stripQueuedOperation); + identifier, + /* subscribeToBackplane=*/ true, + configs.getServer().isRunFailsafeOperation(), + ShardInstance::stripOperation, + ShardInstance::stripQueuedOperation); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); } diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index a5affbbaec..9a919d9703 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -546,7 +546,12 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep if (SHARD.equals(configs.getBackplane().getType())) { backplane = - new RedisShardBackplane(identifier, this::stripOperation, this::stripQueuedOperation); + new RedisShardBackplane( + identifier, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + this::stripOperation, + this::stripQueuedOperation); backplane.start(configs.getWorker().getPublicName()); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); diff --git a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java index 97d8fb5d0f..16e5bb8fbc 100644 --- a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java +++ b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java @@ -58,7 +58,6 @@ @RunWith(JUnit4.class) public class RedisShardBackplaneTest { - private RedisShardBackplane backplane; private BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); @Mock Supplier mockJedisClusterFactory; @@ -66,12 +65,20 
@@ public class RedisShardBackplaneTest { @Before public void setUp() throws IOException { configs.getBackplane().setOperationExpire(10); - configs.getBackplane().setSubscribeToBackplane(false); - configs.getBackplane().setRunFailsafeOperation(false); configs.getBackplane().setQueues(new Queue[] {}); MockitoAnnotations.initMocks(this); } + public RedisShardBackplane createBackplane(String name) { + return new RedisShardBackplane( + name, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + o -> o, + o -> o, + mockJedisClusterFactory); + } + @Test public void workersWithInvalidProtobufAreRemoved() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); @@ -80,9 +87,7 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { .thenReturn(ImmutableMap.of("foo", "foo")); when(jedisCluster.hdel(configs.getBackplane().getWorkersHashName() + "_storage", "foo")) .thenReturn(1L); - backplane = - new RedisShardBackplane( - "invalid-protobuf-worker-removed-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invalid-protobuf-worker-removed-test"); backplane.start("startTime/test:0000"); assertThat(backplane.getStorageWorkers()).isEmpty(); @@ -99,12 +104,10 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { assertThat(workerChange.getTypeCase()).isEqualTo(WorkerChange.TypeCase.REMOVE); } - void verifyChangePublished(JedisCluster jedis) throws IOException { + OperationChange verifyChangePublished(String channel, JedisCluster jedis) throws IOException { ArgumentCaptor changeCaptor = ArgumentCaptor.forClass(String.class); - verify(jedis, times(1)).publish(eq(backplane.operationChannel("op")), changeCaptor.capture()); - OperationChange opChange = parseOperationChange(changeCaptor.getValue()); - assertThat(opChange.hasReset()).isTrue(); - assertThat(opChange.getReset().getOperation().getName()).isEqualTo("op"); + verify(jedis, times(1)).publish(eq(channel), changeCaptor.capture()); + return parseOperationChange(changeCaptor.getValue()); } String operationName(String name) { @@ -115,9 +118,7 @@ String operationName(String name) { public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "prequeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("prequeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -135,32 +136,34 @@ public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getPreQueuedOperationsListName(), JsonFormat.printer().print(executeEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void queuingPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; 
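// queueing() is expected to both persist the queued state and publish a reset
// OperationChange on the operation's channel; the refactored verifyChangePublished
// returns the parsed change so the assertions below can inspect it directly.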
backplane.queueing(opName); verify(mockJedisClusterFactory, times(1)).get(); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -180,7 +183,10 @@ public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getQueuedOperationsListName(), JsonFormat.printer().print(queueEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -194,9 +200,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // ARRANGE @@ -250,9 +254,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // Assume the operation queue is already populated from a first re-queue. 
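// A second requeue of an already-dispatched operation should advance its
// requeue attempt count from 1 to 2, mirroring the 0-to-1 case above.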
@@ -298,9 +300,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() public void completeOperationUndispatches() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "complete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("complete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -317,9 +317,7 @@ public void completeOperationUndispatches() throws IOException { public void deleteOperationDeletesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "delete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("delete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -330,7 +328,10 @@ public void deleteOperationDeletesAndPublishes() throws IOException { verify(jedisCluster, times(1)) .hdel(configs.getBackplane().getDispatchedOperationsHashName(), opName); verify(jedisCluster, times(1)).del(operationName(opName)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -341,9 +342,7 @@ public void invocationsCanBeBlacklisted() throws IOException { configs.getBackplane().getInvocationBlacklistPrefix() + ":" + toolInvocationId; when(jedisCluster.exists(invocationBlacklistKey)).thenReturn(true); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "invocation-blacklist-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invocation-blacklist-test"); backplane.start("startTime/test:0000"); assertThat( @@ -361,8 +360,7 @@ public void invocationsCanBeBlacklisted() throws IOException { public void testGetWorkersStartTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("workers-starttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("workers-starttime-test"); backplane.start("startTime/test:0000"); Set workerNames = ImmutableSet.of("worker1", "worker2", "missing_worker"); @@ -386,8 +384,7 @@ public void testGetWorkersStartTime() throws IOException { public void getDigestInsertTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("startTime/test:0000"); long ttl = 3600L; long expirationInSecs = configs.getBackplane().getCasExpire(); From b21ab2e93a56c74c8071133d4d68675c3527c1b3 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Thu, 28 Sep 2023 16:08:23 -0400 Subject: [PATCH 10/58] Removing AWS/GCP Metrics and Admin controls Internally driven metrics and scaling controls have low, if any, usage rates. 
Prometheus has largely superseded independent publication of metrics, and externally driven scaling is the norm. These modules have been incomplete across cloud providers and, for the functional AWS side, bind us to Spring Boot. Removing them reduces dependencies and complexity. --- defs.bzl | 5 - .../java/build/buildfarm/admin/Admin.java | 36 --- src/main/java/build/buildfarm/admin/BUILD | 15 -- .../build/buildfarm/admin/aws/AwsAdmin.java | 243 ----------------- src/main/java/build/buildfarm/admin/aws/BUILD | 29 -- src/main/java/build/buildfarm/admin/gcp/BUILD | 18 -- .../build/buildfarm/admin/gcp/GcpAdmin.java | 57 ---- .../buildfarm/common/config/Metrics.java | 6 +- .../metrics/aws/AwsMetricsPublisher.java | 151 ----------- .../java/build/buildfarm/metrics/aws/BUILD | 22 -- .../java/build/buildfarm/metrics/gcp/BUILD | 16 -- .../metrics/gcp/GcpMetricsPublisher.java | 31 --- .../buildfarm/server/BuildFarmServer.java | 2 - .../server/services/AdminService.java | 254 ------------------ .../build/buildfarm/server/services/BUILD | 5 - .../server/services/ExecutionService.java | 11 +- .../java/build/buildfarm/worker/shard/BUILD | 2 - src/test/java/build/buildfarm/metrics/BUILD | 4 - .../metrics/MetricsPublisherTest.java | 11 +- 19 files changed, 11 insertions(+), 907 deletions(-) delete mode 100644 src/main/java/build/buildfarm/admin/Admin.java delete mode 100644 src/main/java/build/buildfarm/admin/BUILD delete mode 100644 src/main/java/build/buildfarm/admin/aws/AwsAdmin.java delete mode 100644 src/main/java/build/buildfarm/admin/aws/BUILD delete mode 100644 src/main/java/build/buildfarm/admin/gcp/BUILD delete mode 100644 src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java delete mode 100644 src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java delete mode 100644 src/main/java/build/buildfarm/metrics/aws/BUILD delete mode 100644 src/main/java/build/buildfarm/metrics/gcp/BUILD delete mode 100644 src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java delete mode 100644 src/main/java/build/buildfarm/server/services/AdminService.java diff --git a/defs.bzl b/defs.bzl index 5108852adb..993e9d36f4 100644 --- a/defs.bzl +++ b/defs.bzl @@ -43,12 +43,7 @@ IO_GRPC_MODULES = [ ] COM_AWS_MODULES = [ - "autoscaling", - "core", - "ec2", "secretsmanager", - "sns", - "ssm", "s3", ] diff --git a/src/main/java/build/buildfarm/admin/Admin.java b/src/main/java/build/buildfarm/admin/Admin.java deleted file mode 100644 index ca739f2347..0000000000 --- a/src/main/java/build/buildfarm/admin/Admin.java +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -package build.buildfarm.admin; - -import build.buildfarm.v1test.GetHostsResult; - -public interface Admin { - void terminateHost(String hostId); - - void stopContainer(String hostId, String containerName); - - GetHostsResult getHosts(String filter, int ageInMinutes, String status); - - void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent); - - void disableHostScaleInProtection(String instanceName); - - void disableHostScaleInProtection(String clusterEndpoint, String instanceIp); -} diff --git a/src/main/java/build/buildfarm/admin/BUILD b/src/main/java/build/buildfarm/admin/BUILD deleted file mode 100644 index a5e481399e..0000000000 --- a/src/main/java/build/buildfarm/admin/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -java_library( - name = "admin", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java b/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java deleted file mode 100644 index f40dd72c95..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.admin.aws; - -import static build.buildfarm.common.grpc.Channels.createChannel; - -import build.buildfarm.admin.Admin; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.Host; -import com.amazonaws.services.autoscaling.AmazonAutoScaling; -import com.amazonaws.services.autoscaling.AmazonAutoScalingClientBuilder; -import com.amazonaws.services.autoscaling.model.InstancesDistribution; -import com.amazonaws.services.autoscaling.model.MixedInstancesPolicy; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionRequest; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionResult; -import com.amazonaws.services.autoscaling.model.UpdateAutoScalingGroupRequest; -import com.amazonaws.services.ec2.AmazonEC2; -import com.amazonaws.services.ec2.AmazonEC2ClientBuilder; -import com.amazonaws.services.ec2.model.DescribeInstancesRequest; -import com.amazonaws.services.ec2.model.DescribeInstancesResult; -import com.amazonaws.services.ec2.model.Filter; -import com.amazonaws.services.ec2.model.Instance; -import com.amazonaws.services.ec2.model.Reservation; -import com.amazonaws.services.ec2.model.Tag; -import com.amazonaws.services.ec2.model.TerminateInstancesRequest; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagement; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagementClientBuilder; -import com.amazonaws.services.simplesystemsmanagement.model.SendCommandRequest; -import com.google.protobuf.util.Timestamps; -import io.grpc.ManagedChannel; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.springframework.stereotype.Component; - -@Log -@Component -public class AwsAdmin implements Admin { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private AmazonAutoScaling scale; - private AmazonEC2 ec2; - private AWSSimpleSystemsManagement ssm; - - public AwsAdmin() { - String region = configs.getServer().getCloudRegion(); - if (region != null) { - scale = AmazonAutoScalingClientBuilder.standard().withRegion(region).build(); - ec2 = AmazonEC2ClientBuilder.standard().withRegion(region).build(); - ssm = AWSSimpleSystemsManagementClientBuilder.standard().withRegion(region).build(); - } else { - log.warning("Missing cloudRegion configuration. 
AWS Admin will not be enabled."); - } - } - - @Override - public void terminateHost(String hostId) { - ec2.terminateInstances(new TerminateInstancesRequest().withInstanceIds(hostId)); - log.log(Level.INFO, String.format("Terminated host: %s", hostId)); - } - - @Override - public void stopContainer(String hostId, String containerName) { - String stopContainerCmd = - "docker ps | grep " + containerName + " | awk '{print $1 }' | xargs -I {} docker stop {}"; - Map> parameters = new HashMap<>(); - parameters.put("commands", Collections.singletonList(stopContainerCmd)); - ssm.sendCommand( - new SendCommandRequest() - .withDocumentName("AWS-RunShellScript") - .withInstanceIds(hostId) - .withParameters(parameters)); - log.log(Level.INFO, String.format("Stopped container: %s on host: %s", containerName, hostId)); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - GetHostsResult.Builder resultBuilder = GetHostsResult.newBuilder(); - List hosts = new ArrayList<>(); - DescribeInstancesResult instancesResult = - ec2.describeInstances( - new DescribeInstancesRequest() - .withFilters(new Filter().withName("tag-value").withValues(filter))); - long hostNum = 1L; - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - long uptime = getHostUptimeInMinutes(e.getLaunchTime()); - if (e.getPrivateIpAddress() != null - && uptime > ageInMinutes - && status.equalsIgnoreCase(e.getState().getName())) { - Host.Builder hostBuilder = Host.newBuilder(); - hostBuilder.setHostNum(hostNum++); - hostBuilder.setDnsName(e.getPrivateDnsName()); - hostBuilder.setHostId(e.getInstanceId()); - hostBuilder.setIpAddress(e.getPrivateIpAddress()); - hostBuilder.setLaunchTime(Timestamps.fromMillis(e.getLaunchTime().getTime())); - hostBuilder.setLifecycle( - e.getInstanceLifecycle() != null ? e.getInstanceLifecycle() : "on demand"); - hostBuilder.setNumCores(e.getCpuOptions().getCoreCount()); - hostBuilder.setState(e.getState().getName()); - hostBuilder.setType(e.getInstanceType()); - hostBuilder.setUptimeMinutes(uptime); - hosts.add(hostBuilder.build()); - } - } - } - resultBuilder.addAllHosts(hosts); - resultBuilder.setNumHosts(hosts.size()); - log.log(Level.FINER, String.format("Got %d hosts for filter: %s", hosts.size(), filter)); - return resultBuilder.build(); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - UpdateAutoScalingGroupRequest request = - new UpdateAutoScalingGroupRequest().withAutoScalingGroupName(scaleGroupName); - if (minHosts != null) { - request.setMinSize(minHosts); - } - if (maxHosts != null) { - request.setMaxSize(maxHosts); - } - if (targetHosts != null) { - request.setMaxSize(targetHosts); - } - if (targetReservedHostsPercent != null) { - request.setMixedInstancesPolicy( - new MixedInstancesPolicy() - .withInstancesDistribution( - new InstancesDistribution() - .withOnDemandPercentageAboveBaseCapacity(targetReservedHostsPercent))); - } - scale.updateAutoScalingGroup(request); - log.log(Level.INFO, String.format("Scaled: %s", scaleGroupName)); - } - - private long getHostUptimeInMinutes(Date launchTime) { - Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC")); - return (cal.getTime().getTime() - launchTime.getTime()) / 60000; - } - - /** - * Disable instance scale in protection so that auto scaler can shutdown the instance. 
- * - * @param privateDnsName the private Dns name of instance (i.e. ip-xx-xxx-xx-xx.ec2.internal) - */ - @Override - public void disableHostScaleInProtection(String privateDnsName) { - // 1 get AutoScalingGroup and InstanceId - Instance workerInstance = getInstanceId(privateDnsName); - if (workerInstance == null) { - String errorMessage = "Cannot find instance with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - String instanceId = workerInstance.getInstanceId(); - String autoScalingGroup = getTagValue(workerInstance.getTags()); - if (autoScalingGroup == null || autoScalingGroup.length() == 0) { - String errorMessage = - "Cannot find AutoScalingGroup name of worker with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - - // 2 disable scale in protection of the worker - SetInstanceProtectionRequest disableProtectionRequest = - new SetInstanceProtectionRequest() - .withInstanceIds(instanceId) - .withAutoScalingGroupName(autoScalingGroup) - .withProtectedFromScaleIn(false); - SetInstanceProtectionResult result = scale.setInstanceProtection(disableProtectionRequest); - log.log( - Level.INFO, - String.format( - "Disable protection of host: %s in AutoScalingGroup: %s and get result: %s", - instanceId, autoScalingGroup, result.toString())); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - ManagedChannel channel = null; - try { - channel = createChannel(clusterEndpoint); - AdminGrpc.AdminBlockingStub adminBlockingStub = AdminGrpc.newBlockingStub(channel); - adminBlockingStub.disableScaleInProtection( - DisableScaleInProtectionRequest.newBuilder().setInstanceName(instanceIp).build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - private String getTagValue(List tags) { - for (Tag tag : tags) { - if ("aws:autoscaling:groupName".equalsIgnoreCase(tag.getKey())) { - return tag.getValue(); - } - } - return null; - } - - private Instance getInstanceId(String privateDnsName) { - DescribeInstancesRequest describeInstancesRequest = - new DescribeInstancesRequest() - .withFilters(new Filter().withName("private-dns-name").withValues(privateDnsName)); - DescribeInstancesResult instancesResult = ec2.describeInstances(describeInstancesRequest); - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - if (e.getPrivateDnsName() != null && e.getPrivateDnsName().equals(privateDnsName)) { - return e; - } - } - } - return null; - } -} diff --git a/src/main/java/build/buildfarm/admin/aws/BUILD b/src/main/java/build/buildfarm/admin/aws/BUILD deleted file mode 100644 index a49e431bfd..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/common/grpc", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_autoscaling", - 
"@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_ec2", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_ssm", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:io_grpc_grpc_api", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/BUILD b/src/main/java/build/buildfarm/admin/gcp/BUILD deleted file mode 100644 index 3d94b91f3f..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java b/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java deleted file mode 100644 index 34ee9a0163..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.admin.gcp; - -import build.buildfarm.admin.Admin; -import build.buildfarm.v1test.GetHostsResult; -import org.springframework.stereotype.Component; - -@Component -public class GcpAdmin implements Admin { - @Override - public void terminateHost(String hostId) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void stopContainer(String hostId, String containerName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String instanceName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - throw new UnsupportedOperationException("Not Implemented"); - } -} diff --git a/src/main/java/build/buildfarm/common/config/Metrics.java b/src/main/java/build/buildfarm/common/config/Metrics.java index 73a9113d7a..28e065ddbf 100644 --- a/src/main/java/build/buildfarm/common/config/Metrics.java +++ b/src/main/java/build/buildfarm/common/config/Metrics.java @@ -1,6 +1,8 @@ package build.buildfarm.common.config; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Metrics { @@ -19,7 +21,9 @@ public enum LOG_LEVEL { FINEST, } - private PUBLISHER publisher = PUBLISHER.LOG; + @Getter(AccessLevel.NONE) + private PUBLISHER publisher = PUBLISHER.LOG; // deprecated + private LOG_LEVEL logLevel = LOG_LEVEL.FINEST; private String topic; private int topicMaxConnections; diff --git a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java deleted file mode 100644 index e0ae2785e0..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.metrics.aws; - -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; -import com.amazonaws.ClientConfiguration; -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.handlers.AsyncHandler; -import com.amazonaws.services.secretsmanager.AWSSecretsManager; -import com.amazonaws.services.secretsmanager.AWSSecretsManagerClientBuilder; -import com.amazonaws.services.secretsmanager.model.GetSecretValueRequest; -import com.amazonaws.services.secretsmanager.model.GetSecretValueResult; -import com.amazonaws.services.sns.AmazonSNSAsync; -import com.amazonaws.services.sns.AmazonSNSAsyncClientBuilder; -import com.amazonaws.services.sns.model.PublishRequest; -import com.amazonaws.services.sns.model.PublishResult; -import com.amazonaws.util.StringUtils; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.longrunning.Operation; -import java.io.IOException; -import java.util.Base64; -import java.util.HashMap; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AwsMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static AmazonSNSAsync snsClient; - - private final String snsTopicOperations; - private String accessKeyId = null; - private String secretKey = null; - private final String region; - private final int snsClientMaxConnections; - - public AwsMetricsPublisher() { - super(configs.getServer().getClusterId()); - snsTopicOperations = configs.getServer().getMetrics().getTopic(); - region = configs.getServer().getCloudRegion(); - getAwsSecret(configs.getServer().getMetrics().getSecretName()); - snsClientMaxConnections = configs.getServer().getMetrics().getTopicMaxConnections(); - if (!StringUtils.isNullOrEmpty(snsTopicOperations) - && snsClientMaxConnections > 0 - && !StringUtils.isNullOrEmpty(accessKeyId) - && !StringUtils.isNullOrEmpty(secretKey) - && !StringUtils.isNullOrEmpty(region)) { - snsClient = initSnsClient(); - } - } - - @Override - public void publishRequestMetadata(Operation operation, RequestMetadata requestMetadata) { - try { - if (snsClient != null) { - snsClient.publishAsync( - new PublishRequest( - snsTopicOperations, - formatRequestMetadataToJson(populateRequestMetadata(operation, requestMetadata))), - new AsyncHandler() { - @Override - public void onError(Exception e) { - log.log(Level.WARNING, "Could not publish metrics data to SNS.", e); - } - - @Override - public void onSuccess(PublishRequest request, PublishResult publishResult) {} - }); - } - } catch (Exception e) { - log.log( - Level.WARNING, - String.format("Could not publish request metadata to SNS for %s.", operation.getName()), - e); - } - } - - private AmazonSNSAsync initSnsClient() { - log.log(Level.INFO, "Initializing SNS Client."); - return AmazonSNSAsyncClientBuilder.standard() - .withRegion(region) - .withClientConfiguration( - new ClientConfiguration().withMaxConnections(snsClientMaxConnections)) - .withCredentials( - new AWSStaticCredentialsProvider( - new AWSCredentials() { - @Override - public String getAWSAccessKeyId() { - return accessKeyId; - } - - @Override - public String getAWSSecretKey() { - return secretKey; - } - })) - .build(); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - 
} - - @SuppressWarnings("unchecked") - private void getAwsSecret(String secretName) { - AWSSecretsManager client = AWSSecretsManagerClientBuilder.standard().withRegion(region).build(); - GetSecretValueRequest getSecretValueRequest = - new GetSecretValueRequest().withSecretId(secretName); - GetSecretValueResult getSecretValueResult; - try { - getSecretValueResult = client.getSecretValue(getSecretValueRequest); - } catch (Exception e) { - log.log(Level.SEVERE, String.format("Could not get secret %s from AWS.", secretName)); - return; - } - String secret; - if (getSecretValueResult.getSecretString() != null) { - secret = getSecretValueResult.getSecretString(); - } else { - secret = - new String(Base64.getDecoder().decode(getSecretValueResult.getSecretBinary()).array()); - } - - if (secret != null) { - try { - final ObjectMapper objectMapper = new ObjectMapper(); - final HashMap secretMap = objectMapper.readValue(secret, HashMap.class); - accessKeyId = secretMap.get("access_key"); - secretKey = secretMap.get("secret_key"); - } catch (IOException e) { - log.log(Level.SEVERE, String.format("Could not parse secret %s from AWS", secretName)); - } - } - } -} diff --git a/src/main/java/build/buildfarm/metrics/aws/BUILD b/src/main/java/build/buildfarm/metrics/aws/BUILD deleted file mode 100644 index 51d44ea8c9..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_sns", - "@maven//:com_fasterxml_jackson_core_jackson_databind", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_projectlombok_lombok", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/BUILD b/src/main/java/build/buildfarm/metrics/gcp/BUILD deleted file mode 100644 index 765902b905..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java deleted file mode 100644 index 40a6ddb1f8..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.metrics.gcp; - -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; - -public class GcpMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public GcpMetricsPublisher() { - super(configs.getServer().getClusterId()); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - } -} diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index e82928e934..4ca669d5f1 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -30,7 +30,6 @@ import build.buildfarm.instance.shard.ShardInstance; import build.buildfarm.metrics.prometheus.PrometheusPublisher; import build.buildfarm.server.services.ActionCacheService; -import build.buildfarm.server.services.AdminService; import build.buildfarm.server.services.CapabilitiesService; import build.buildfarm.server.services.ExecutionService; import build.buildfarm.server.services.FetchService; @@ -122,7 +121,6 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName .addService(new ExecutionService(instance, keepaliveScheduler)) .addService(new OperationQueueService(instance)) .addService(new OperationsService(instance)) - .addService(new AdminService(instance)) .addService(new FetchService(instance)) .addService(ProtoReflectionService.newInstance()) .addService(new PublishBuildEventService()) diff --git a/src/main/java/build/buildfarm/server/services/AdminService.java b/src/main/java/build/buildfarm/server/services/AdminService.java deleted file mode 100644 index 94178fbf27..0000000000 --- a/src/main/java/build/buildfarm/server/services/AdminService.java +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.server.services; - -import static build.buildfarm.common.grpc.Channels.createChannel; - -import build.buildfarm.admin.Admin; -import build.buildfarm.admin.aws.AwsAdmin; -import build.buildfarm.admin.gcp.GcpAdmin; -import build.buildfarm.common.CasIndexResults; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.instance.Instance; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.DisableScaleInProtectionRequestResults; -import build.buildfarm.v1test.GetClientStartTimeRequest; -import build.buildfarm.v1test.GetClientStartTimeResult; -import build.buildfarm.v1test.GetHostsRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; -import build.buildfarm.v1test.ReindexCasRequest; -import build.buildfarm.v1test.ReindexCasRequestResults; -import build.buildfarm.v1test.ScaleClusterRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequestResults; -import build.buildfarm.v1test.ShutDownWorkerGrpc; -import build.buildfarm.v1test.StopContainerRequest; -import build.buildfarm.v1test.TerminateHostRequest; -import com.google.rpc.Code; -import com.google.rpc.Status; -import io.grpc.ManagedChannel; -import io.grpc.stub.StreamObserver; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AdminService extends AdminGrpc.AdminImplBase { - private final Admin adminController; - private final Instance instance; - - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public AdminService(Instance instance) { - this.adminController = getAdminController(); - this.instance = instance; - } - - @Override - public void terminateHost(TerminateHostRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.terminateHost(request.getHostId()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not terminate host.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void stopContainer(StopContainerRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.stopContainer(request.getHostId(), request.getContainerName()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not stop container.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getHosts(GetHostsRequest request, StreamObserver responseObserver) { - try { - GetHostsResult result = null; - if (adminController != null) { - result = - adminController.getHosts( - request.getFilter(), request.getAgeInMinutes(), request.getStatus()); - } - responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not get hosts.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getClientStartTime( - GetClientStartTimeRequest request, - StreamObserver responseObserver) { - try { - GetClientStartTimeResult result = instance.getClientStartTime(request); - 
responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log( - Level.SEVERE, - String.format("Could not get client start time for %s.", request.getInstanceName()), - e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void scaleCluster(ScaleClusterRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.scaleCluster( - request.getScaleGroupName(), - request.getMinHosts(), - request.getMaxHosts(), - request.getTargetHosts(), - request.getTargetReservedHostsPercent()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not scale cluster.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void reindexCas( - ReindexCasRequest request, StreamObserver responseObserver) { - try { - CasIndexResults results = instance.reindexCas(); - log.info(String.format("CAS Indexer Results: %s", results.toMessage())); - responseObserver.onNext( - ReindexCasRequestResults.newBuilder() - .setRemovedHosts(results.removedHosts) - .setRemovedKeys(results.removedKeys) - .setTotalKeys(results.totalKeys) - .build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not reindex CAS.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - /** - * Server-side implementation of ShutDownWorkerGracefully. This will reroute the request to target - * worker. - * - * @param request ShutDownWorkerGracefullyRequest received through grpc - * @param responseObserver grpc response observer - */ - @Override - public void shutDownWorkerGracefully( - ShutDownWorkerGracefullyRequest request, - StreamObserver responseObserver) { - try { - informWorkerToPrepareForShutdown(request.getWorkerName()); - responseObserver.onNext(ShutDownWorkerGracefullyRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (Exception e) { - String errorMessage = - String.format( - "Could not inform the worker %s to prepare for graceful shutdown with error %s.", - request.getWorkerName(), e.getMessage()); - log.log(Level.SEVERE, errorMessage); - responseObserver.onError(new Exception(errorMessage)); - } - } - - /** - * Inform a worker to prepare for graceful shutdown. - * - * @param host the host that should be prepared for shutdown. - */ - @SuppressWarnings("ResultOfMethodCallIgnored") - private void informWorkerToPrepareForShutdown(String host) { - ManagedChannel channel = null; - try { - channel = createChannel(host); - ShutDownWorkerGrpc.ShutDownWorkerBlockingStub shutDownWorkerBlockingStub = - ShutDownWorkerGrpc.newBlockingStub(channel); - shutDownWorkerBlockingStub.prepareWorkerForGracefulShutdown( - PrepareWorkerForGracefulShutDownRequest.newBuilder().build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - /** - * Server-side implementation of disableScaleInProtection. 
- * - * @param request grpc request - * @param responseObserver grpc response observer - */ - @Override - public void disableScaleInProtection( - DisableScaleInProtectionRequest request, - StreamObserver responseObserver) { - try { - String hostPrivateIp = trimHostPrivateDns(request.getInstanceName()); - adminController.disableHostScaleInProtection(hostPrivateIp); - responseObserver.onNext(DisableScaleInProtectionRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (RuntimeException e) { - responseObserver.onError(e); - } - } - - /** - * The private dns get from worker might be suffixed with ":portNumber", which should be trimmed. - * - * @param hostPrivateIp the private dns should be trimmed. - * @return - */ - @SuppressWarnings("JavaDoc") - private String trimHostPrivateDns(String hostPrivateIp) { - String portSeparator = ":"; - if (hostPrivateIp.contains(portSeparator)) { - hostPrivateIp = hostPrivateIp.split(portSeparator)[0]; - } - return hostPrivateIp; - } - - private static Admin getAdminController() { - if (configs.getServer().getAdmin().getDeploymentEnvironment() == null) { - return null; - } - switch (configs.getServer().getAdmin().getDeploymentEnvironment()) { - default: - return null; - case AWS: - return new AwsAdmin(); - case GCP: - return new GcpAdmin(); - } - } -} diff --git a/src/main/java/build/buildfarm/server/services/BUILD b/src/main/java/build/buildfarm/server/services/BUILD index 54b7af993c..6bb44b94f8 100644 --- a/src/main/java/build/buildfarm/server/services/BUILD +++ b/src/main/java/build/buildfarm/server/services/BUILD @@ -4,17 +4,12 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", - "//src/main/java/build/buildfarm/admin/gcp", "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/common/resources:resource_java_proto", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/metrics", - "//src/main/java/build/buildfarm/metrics/aws", - "//src/main/java/build/buildfarm/metrics/gcp", "//src/main/java/build/buildfarm/metrics/log", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", diff --git a/src/main/java/build/buildfarm/server/services/ExecutionService.java b/src/main/java/build/buildfarm/server/services/ExecutionService.java index 7f46c0bb3c..69799524fb 100644 --- a/src/main/java/build/buildfarm/server/services/ExecutionService.java +++ b/src/main/java/build/buildfarm/server/services/ExecutionService.java @@ -28,8 +28,6 @@ import build.buildfarm.common.grpc.TracingMetadataUtils; import build.buildfarm.instance.Instance; import build.buildfarm.metrics.MetricsPublisher; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; -import build.buildfarm.metrics.gcp.GcpMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.ListenableFuture; @@ -205,13 +203,6 @@ public void execute(ExecuteRequest request, StreamObserver responseOb } private static MetricsPublisher getMetricsPublisher() { - switch (configs.getServer().getMetrics().getPublisher()) { - default: - return new LogMetricsPublisher(); - case AWS: - return new AwsMetricsPublisher(); - case 
GCP: - return new GcpMetricsPublisher(); - } + return new LogMetricsPublisher(); } } diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index beeaaae918..120365882b 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -4,8 +4,6 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", "//src/main/java/build/buildfarm/backplane", "//src/main/java/build/buildfarm/cas", "//src/main/java/build/buildfarm/common", diff --git a/src/test/java/build/buildfarm/metrics/BUILD b/src/test/java/build/buildfarm/metrics/BUILD index 8e5d3cace9..c32955e20d 100644 --- a/src/test/java/build/buildfarm/metrics/BUILD +++ b/src/test/java/build/buildfarm/metrics/BUILD @@ -8,14 +8,10 @@ java_test( "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/metrics", - "//src/main/java/build/buildfarm/metrics/aws", "//src/main/java/build/buildfarm/metrics/log", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "//src/test/java/build/buildfarm:test_runner", "@googleapis//:google_rpc_error_details_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_sns", "@maven//:com_google_guava_guava", "@maven//:com_google_jimfs_jimfs", "@maven//:com_google_protobuf_protobuf_java", diff --git a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java index 21c9685efd..f30adb55c6 100644 --- a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java +++ b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java @@ -22,7 +22,6 @@ import build.bazel.remote.execution.v2.RequestMetadata; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Metrics; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import build.buildfarm.v1test.OperationRequestMetadata; import com.google.longrunning.Operation; @@ -69,7 +68,7 @@ public class MetricsPublisherTest { public void setUp() throws IOException { configs.getServer().setCloudRegion("test"); configs.getServer().setClusterId("buildfarm-test"); - configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.AWS); + configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.LOG); } @Test @@ -81,7 +80,7 @@ public void publishCompleteMetricsTest() throws InvalidProtocolBufferException { .setMetadata(Any.pack(defaultExecuteOperationMetadata)) .build(); - AwsMetricsPublisher metricsPublisher = new AwsMetricsPublisher(); + LogMetricsPublisher metricsPublisher = new LogMetricsPublisher(); assertThat( AbstractMetricsPublisher.formatRequestMetadataToJson( metricsPublisher.populateRequestMetadata(operation, defaultRequestMetadata))) @@ -110,7 +109,7 @@ public void publishMetricsWithNoExecuteResponseTest() { Operation operation = defaultOperation.toBuilder().setMetadata(Any.pack(defaultExecuteOperationMetadata)).build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } @@ -119,7 +118,7 @@ public void 
publishMetricsWithNoExecuteOperationMetadataTest() {
Operation operation =
defaultOperation.toBuilder().setResponse(Any.pack(defaultExecuteResponse)).build();
-    assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata))
+    assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata))
.isNotNull();
}
@@ -135,7 +134,7 @@ public void preconditionFailureTest() {
.setMetadata(Any.pack(defaultExecuteOperationMetadata))
.build();
-    assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata))
+    assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata))
.isNotNull();
}

From f97f04816119775bcd52a9becd75c405450c52f4 Mon Sep 17 00:00:00 2001
From: George Gensure
Date: Tue, 3 Oct 2023 16:00:15 -0400
Subject: [PATCH 11/58] Remove unused setOnCancelHandler

Remove this unused OperationQueue feature, which is never invoked by any caller.
---
.../buildfarm/instance/MatchListener.java | 3 ---
.../server/services/OperationQueueService.java | 18 ++----------------
.../build/buildfarm/worker/MatchStage.java | 5 -----
.../worker/shard/ShardWorkerContext.java | 5 -----
4 files changed, 2 insertions(+), 29 deletions(-)

diff --git a/src/main/java/build/buildfarm/instance/MatchListener.java b/src/main/java/build/buildfarm/instance/MatchListener.java
index 46f6dfc000..3da7df90eb 100644
--- a/src/main/java/build/buildfarm/instance/MatchListener.java
+++ b/src/main/java/build/buildfarm/instance/MatchListener.java
@@ -27,7 +27,4 @@ public interface MatchListener {
boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException;
void onError(Throwable t);
-
- // method that should be called when this match is cancelled and no longer valid
- void setOnCancelHandler(Runnable onCancelHandler);
}
diff --git a/src/main/java/build/buildfarm/server/services/OperationQueueService.java b/src/main/java/build/buildfarm/server/services/OperationQueueService.java
index 6d3edb8178..11f6501667 100644
--- a/src/main/java/build/buildfarm/server/services/OperationQueueService.java
+++ b/src/main/java/build/buildfarm/server/services/OperationQueueService.java
@@ -28,9 +28,7 @@
import com.google.rpc.Code;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
-import io.grpc.stub.ServerCallStreamObserver;
import io.grpc.stub.StreamObserver;
-import java.util.function.Consumer;
public class OperationQueueService extends OperationQueueGrpc.OperationQueueImplBase {
private final Instance instance;
@@ -43,14 +41,11 @@ private static class OperationQueueMatchListener implements MatchListener {
@SuppressWarnings("rawtypes")
private final InterruptingPredicate onMatch;
- private final Consumer setOnCancelHandler;
private static final QueueEntry queueEntry = null;
@SuppressWarnings("rawtypes")
- OperationQueueMatchListener(
- InterruptingPredicate onMatch, Consumer setOnCancelHandler) {
+ OperationQueueMatchListener(InterruptingPredicate onMatch) {
this.onMatch = onMatch;
- this.setOnCancelHandler = setOnCancelHandler;
}
@Override
@@ -70,11 +65,6 @@ public void onError(Throwable t) {
Throwables.throwIfUnchecked(t);
throw new RuntimeException(t);
}
-
- @Override
- public void setOnCancelHandler(Runnable onCancelHandler) {
- setOnCancelHandler.accept(onCancelHandler);
- }
}
private InterruptingPredicate createOnMatch(
@@ -97,14 +87,10 @@ private InterruptingPredicate createOnMatch(
@Override
public void take(TakeOperationRequest request, StreamObserver responseObserver) {
-    ServerCallStreamObserver callObserver =
-        (ServerCallStreamObserver) responseObserver;
-
try {
instance.match(
request.getPlatform(),
-          new OperationQueueMatchListener(
-              createOnMatch(instance, responseObserver), callObserver::setOnCancelHandler));
+          new OperationQueueMatchListener(createOnMatch(instance, responseObserver)));
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
diff --git a/src/main/java/build/buildfarm/worker/MatchStage.java b/src/main/java/build/buildfarm/worker/MatchStage.java
index dc81a2ce31..663aee3c52 100644
--- a/src/main/java/build/buildfarm/worker/MatchStage.java
+++ b/src/main/java/build/buildfarm/worker/MatchStage.java
@@ -95,11 +95,6 @@ public void onError(Throwable t) {
throw new RuntimeException(t);
}
- @Override
- public void setOnCancelHandler(Runnable onCancelHandler) {
- // never called, only blocking stub used
- }
-
@SuppressWarnings("SameReturnValue")
private boolean onOperationPolled() throws InterruptedException {
String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName();
diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java
index 143fa09966..9f4ee3a29e 100644
--- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java
+++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java
@@ -350,11 +350,6 @@ public void onError(Throwable t) {
Throwables.throwIfUnchecked(t);
throw new RuntimeException(t);
}
-
- @Override
- public void setOnCancelHandler(Runnable onCancelHandler) {
- listener.setOnCancelHandler(onCancelHandler);
- }
};
while (dedupMatchListener.getMatched()) {
try {

From c2c15449efddefad5e6bdd11107dbcc1e99f8f7b Mon Sep 17 00:00:00 2001
From: George Gensure
Date: Wed, 4 Oct 2023 16:46:57 -0400
Subject: [PATCH 12/58] Update BWoB docs for ensureOutputsPresent

---
_site/docs/execution/builds-without-the-bytes.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_site/docs/execution/builds-without-the-bytes.md b/_site/docs/execution/builds-without-the-bytes.md
index abda8c86b5..5c49025877 100644
--- a/_site/docs/execution/builds-without-the-bytes.md
+++ b/_site/docs/execution/builds-without-the-bytes.md
@@ -7,7 +7,7 @@ nav_order: 4
# Builds Without The Bytes
-tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)` to your BWOB bazel invocations.
+tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)` to your BWOB bazel invocations, or enable `ensureOutputsPresent` in your config to set it globally.
As proposed in this [issue](https://github.com/bazelbuild/bazel/issues/6862) and the accompanying document, bazel endeavors to provide a mechanism to be 'content-light' for remote execution, using only content reference addresses to request action execution and construct successively dependent action definitions.
@@ -17,4 +17,4 @@ This puts BuildFarm in the uncomfortable position of never being able to expire
To combat this, you can provide some metadata to buildfarm that will help to limit (but will not remove the possibility of) failed builds.
Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results.
Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution. \ No newline at end of file +Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results. Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution. From 09015c9abfd02015ed3fdf88e6a72ec143a83eff Mon Sep 17 00:00:00 2001 From: Yun Peng Date: Fri, 6 Oct 2023 13:34:21 +0000 Subject: [PATCH 13/58] Disable Bzlmod explicitly in .bazelrc --- .bazelrc | 4 ++++ MODULE.bazel | 2 ++ 2 files changed, 6 insertions(+) create mode 100644 MODULE.bazel diff --git a/.bazelrc b/.bazelrc index cfea58ba6a..adbb3c68fd 100644 --- a/.bazelrc +++ b/.bazelrc @@ -14,3 +14,7 @@ test --test_tag_filters=-redis,-integration # Ensure buildfarm is compatible with future versions of bazel. 
# https://buildkite.com/bazel/bazelisk-plus-incompatible-flags common --incompatible_disallow_empty_glob + +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 +common --noenable_bzlmod diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..cabbbe697c --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,2 @@ +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 From 1041b57e55cba97d41657b094384a9725c45d5b5 Mon Sep 17 00:00:00 2001 From: Anshuman Mishra Date: Thu, 28 Sep 2023 21:55:15 -0700 Subject: [PATCH 14/58] Log write errors with worker address --- .../buildfarm/common/grpc/StubWriteOutputStream.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java index 26b67f5c6c..4e500398b2 100644 --- a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java +++ b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java @@ -227,6 +227,14 @@ public void onNext(WriteResponse response) { @Override public void onError(Throwable t) { + log.log( + WARNING, + format( + "%s: write(%s) on worker %s after %d bytes of content", + Status.fromThrowable(t).getCode().name(), + resourceName, + bsStub.get().getChannel().authority(), + writtenBytes)); writeFuture.setException(exceptionTranslator.apply(t)); } From 30412adf9bcfa72734bd9fb3ac4b13bda397b9d2 Mon Sep 17 00:00:00 2001 From: Yuriy Belenitsky Date: Sun, 8 Oct 2023 08:25:16 -0400 Subject: [PATCH 15/58] Revert "Use integer ids for Sqlite bidirectional index" This reverts commit f651cdbf7b2203e8cdb5645a1f869589e42f28c3. 
--- .../cas/cfc/SqliteFileDirectoriesIndex.java | 64 ++++--------------- .../cas/cfc/DirectoriesIndexTest.java | 1 - 2 files changed, 11 insertions(+), 54 deletions(-) diff --git a/src/main/java/build/buildfarm/cas/cfc/SqliteFileDirectoriesIndex.java b/src/main/java/build/buildfarm/cas/cfc/SqliteFileDirectoriesIndex.java index b18b5bd22c..030c749037 100644 --- a/src/main/java/build/buildfarm/cas/cfc/SqliteFileDirectoriesIndex.java +++ b/src/main/java/build/buildfarm/cas/cfc/SqliteFileDirectoriesIndex.java @@ -57,19 +57,13 @@ private void open() { throw new RuntimeException(e); } - String createDirectoriesSql = - "CREATE TABLE directories (id INTEGER PRIMARY KEY, name VARCHAR UNIQUE)"; - String createFilesSql = "CREATE TABLE files (id INTEGER PRIMARY KEY, name VARCHAR UNIQUE)"; String createEntriesSql = "CREATE TABLE entries (\n" - + " file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,\n" - + " directory_id INTEGER NOT NULL REFERENCES directories(id) ON DELETE CASCADE,\n" - + " PRIMARY KEY (file_id, directory_id)\n" + + " path TEXT NOT NULL,\n" + + " directory TEXT NOT NULL\n" + ")"; try (Statement stmt = conn.createStatement()) { - stmt.execute(createDirectoriesSql); - stmt.execute(createFilesSql); stmt.execute(createEntriesSql); } catch (SQLException e) { throw new RuntimeException(e); @@ -83,13 +77,11 @@ private void open() { public synchronized void start() { open(); - String createPathIndexSql = "CREATE INDEX files_name_idx ON entries (file_id)"; - String createDirectoryIndexSql = "CREATE INDEX directory_idx ON entries (directory_id)"; - String enforceForeignKeys = "PRAGMA foreign_keys=ON"; + String createPathIndexSql = "CREATE INDEX path_idx ON entries (path)"; + String createDirectoryIndexSql = "CREATE INDEX directory_idx ON entries (directory)"; try (Statement stmt = conn.createStatement()) { stmt.execute(createPathIndexSql); stmt.execute(createDirectoryIndexSql); - stmt.execute(enforceForeignKeys); } catch (SQLException e) { throw new RuntimeException(e); } @@ -109,8 +101,7 @@ public void close() { private Set removeEntryDirectories(String entry) { open(); - String selectSql = - "SELECT d.name as directory FROM files f INNER JOIN entries e ON f.id = e.file_id INNER JOIN directories d ON d.id = e.directory_id WHERE f.name = ?"; + String selectSql = "SELECT directory FROM entries WHERE path = ?"; ImmutableSet.Builder directoriesBuilder = ImmutableSet.builder(); try (PreparedStatement selectStatement = conn.prepareStatement(selectSql)) { @@ -125,7 +116,7 @@ private Set removeEntryDirectories(String entry) { } // all directories featuring this entry are now invalid ImmutableSet directories = directoriesBuilder.build(); - String deleteSql = "DELETE FROM directories where name = ?"; + String deleteSql = "DELETE FROM entries where directory = ?"; try (PreparedStatement deleteStatement = conn.prepareStatement(deleteSql)) { conn.setAutoCommit(false); for (Digest directory : directories) { @@ -137,14 +128,6 @@ private Set removeEntryDirectories(String entry) { } catch (SQLException e) { throw new RuntimeException(e); } - // clear out orphaned files - try (Statement orphanStatement = conn.createStatement()) { - String deleteOrphanSql = - "DELETE FROM files WHERE id in (SELECT id FROM files f LEFT JOIN entries e ON f.id = e.file_id WHERE e.file_id IS NULL)"; - orphanStatement.execute(deleteOrphanSql); - } catch (SQLException e) { - throw new RuntimeException(e); - } return directories; } @@ -158,37 +141,13 @@ public synchronized Set removeEntry(String entry) throws IOException { 
private synchronized void addEntriesDirectory(Set entries, Digest directory) { open(); - String directoryName = DigestUtil.toString(directory); - String filesInsertSql = "INSERT OR IGNORE INTO files (name) VALUES (?)"; - try (PreparedStatement filesInsertStatement = conn.prepareStatement(filesInsertSql)) { - conn.setAutoCommit(false); - for (String entry : entries) { - filesInsertStatement.setString(1, entry); - filesInsertStatement.addBatch(); - } - filesInsertStatement.executeBatch(); - conn.commit(); - } catch (SQLException e) { - throw new RuntimeException(e); - } - // should be novel directory - String directoriesInsertSql = "INSERT INTO directories (name) VALUES (?)"; - try (PreparedStatement directoriesInsertStatement = - conn.prepareStatement(directoriesInsertSql)) { - conn.setAutoCommit(false); - directoriesInsertStatement.setString(1, directoryName); - directoriesInsertStatement.executeUpdate(); - conn.commit(); - } catch (SQLException e) { - throw new RuntimeException(e); - } - String entriesInsertSql = - "INSERT INTO entries (file_id, directory_id) SELECT f.id, d.id FROM files f, directories d WHERE f.name = ? AND d.name = ?"; - try (PreparedStatement insertStatement = conn.prepareStatement(entriesInsertSql)) { + String digest = DigestUtil.toString(directory); + String insertSql = "INSERT INTO entries (path, directory) VALUES (?,?)"; + try (PreparedStatement insertStatement = conn.prepareStatement(insertSql)) { conn.setAutoCommit(false); + insertStatement.setString(2, digest); for (String entry : entries) { insertStatement.setString(1, entry); - insertStatement.setString(2, directoryName); insertStatement.addBatch(); } insertStatement.executeBatch(); @@ -209,9 +168,8 @@ private void removeEntriesDirectory(Digest directory) { open(); String digest = DigestUtil.toString(directory); - String deleteSql = "DELETE FROM directories WHERE name = ?"; + String deleteSql = "DELETE FROM entries WHERE directory = ?"; try (PreparedStatement deleteStatement = conn.prepareStatement(deleteSql)) { - conn.setAutoCommit(true); deleteStatement.setString(1, digest); deleteStatement.executeUpdate(); } catch (SQLException e) { diff --git a/src/test/java/build/buildfarm/cas/cfc/DirectoriesIndexTest.java b/src/test/java/build/buildfarm/cas/cfc/DirectoriesIndexTest.java index b4effa2bc8..3eec66def7 100644 --- a/src/test/java/build/buildfarm/cas/cfc/DirectoriesIndexTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/DirectoriesIndexTest.java @@ -51,7 +51,6 @@ protected DirectoriesIndexTest(Path root, DirectoriesIndexType type) { } else { throw new IllegalArgumentException("DirectoriesIndex type is not supported."); } - directoriesIndex.start(); } @Before From 83790688c9e589ea35b3526c71ed95899f417633 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 16:08:51 -0400 Subject: [PATCH 16/58] Common String.format for PipelineStage --- .../java/build/buildfarm/worker/PipelineStage.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/PipelineStage.java b/src/main/java/build/buildfarm/worker/PipelineStage.java index edad2bb572..ecc78f8e7e 100644 --- a/src/main/java/build/buildfarm/worker/PipelineStage.java +++ b/src/main/java/build/buildfarm/worker/PipelineStage.java @@ -14,6 +14,7 @@ package build.buildfarm.worker; +import static java.lang.String.format; import static java.util.concurrent.TimeUnit.MICROSECONDS; import com.google.common.base.Stopwatch; @@ -63,8 +64,7 @@ public void run() { // ignore } catch (Exception e) { 
getLogger() - .log( - Level.SEVERE, String.format("%s::run(): stage terminated due to exception", name), e); + .log(Level.SEVERE, format("%s::run(): stage terminated due to exception", name), e); } finally { boolean wasInterrupted = Thread.interrupted(); try { @@ -143,7 +143,7 @@ protected void iterate() throws InterruptedException { } private String logIterateId(String operationName) { - return String.format("%s::iterate(%s)", name, operationName); + return format("%s::iterate(%s)", name, operationName); } protected void start() { @@ -157,7 +157,7 @@ protected void start(String operationName) { protected void start(String operationName, String message) { // TODO to unary stage this.operationName = operationName; - getLogger().log(Level.FINER, String.format("%s: %s", logIterateId(operationName), message)); + getLogger().log(Level.FINER, format("%s: %s", logIterateId(operationName), message)); } protected void complete(String operationName, long usecs, long stallUSecs, boolean success) { @@ -169,7 +169,7 @@ protected void complete(String operationName, long usecs, long stallUSecs, Strin getLogger() .log( Level.FINER, - String.format( + format( "%s: %g ms (%g ms stalled) %s", logIterateId(operationName), usecs / 1000.0f, stallUSecs / 1000.0f, status)); } From fd20d134497061af3d7bd3e06c209a53a01559ae Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 16:18:10 -0400 Subject: [PATCH 17/58] Cleanup matched logic in SWC listener Continue the loop while we have *not* matched successfully and avoid a confusing inversion in getMatched() --- .../java/build/buildfarm/worker/shard/ShardWorkerContext.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index 9f4ee3a29e..18ab0d8fe5 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -313,7 +313,7 @@ public void match(MatchListener listener) throws InterruptedException { @Override public boolean getMatched() { - return !matched; + return matched; } @Override @@ -351,7 +351,7 @@ public void onError(Throwable t) { throw new RuntimeException(t); } }; - while (dedupMatchListener.getMatched()) { + while (!dedupMatchListener.getMatched()) { try { matchInterruptible(dedupMatchListener); } catch (IOException e) { From 632fceb1b02a13f519a1a6ff64a403262bd9b05c Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 16:20:55 -0400 Subject: [PATCH 18/58] Refactor SWC matcher and clarify Nullable Distinguish the valid/unique/propagating methods of entry listening. 
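
As a sketch of the resulting shape (method names match the diff that follows; bodies are abbreviated here, and generic type parameters are elided as they are throughout this patch text):

    public boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException {
      if (queueEntry == null) {
        matched = true;
        return listener.onEntry(null); // propagate the no-match signal as-is
      }
      return onValidEntry(queueEntry); // dedup by operation name, then mark unique
    }

onValidEntry rejects duplicate operation names against activeOperations and delegates to onUniqueEntry, which records the match and requeues the operation if the wrapped listener rejects it.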
--- .../buildfarm/worker/shard/ShardWorkerContext.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index 18ab0d8fe5..d32731879c 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -92,6 +92,7 @@ import java.util.Map; import java.util.Stack; import java.util.logging.Level; +import javax.annotation.Nullable; import lombok.extern.java.Log; @Log @@ -327,20 +328,28 @@ public void onWaitEnd() { } @Override - public boolean onEntry(QueueEntry queueEntry) throws InterruptedException { + public boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException { if (queueEntry == null) { matched = true; return listener.onEntry(null); } + return onValidEntry(queueEntry); + } + + private boolean onValidEntry(QueueEntry queueEntry) throws InterruptedException { String operationName = queueEntry.getExecuteEntry().getOperationName(); if (activeOperations.putIfAbsent(operationName, queueEntry) != null) { log.log(Level.WARNING, "matched duplicate operation " + operationName); return false; } + return onUniqueEntry(queueEntry); + } + + private boolean onUniqueEntry(QueueEntry queueEntry) throws InterruptedException { matched = true; boolean success = listener.onEntry(queueEntry); if (!success) { - requeue(operationName); + requeue(queueEntry.getExecuteEntry().getOperationName()); } return success; } From 7385c74cf0c711a91d1130fa8f406dcb57a8dc60 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 16:30:43 -0400 Subject: [PATCH 19/58] Interrupt matchStage to induce prepare shutdown The only signal to a waiting match that will halt its current listen loop for a valid unique operation is an interrupt. --- .../java/build/buildfarm/worker/Pipeline.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/Pipeline.java b/src/main/java/build/buildfarm/worker/Pipeline.java index da4eaacfb6..6b2d4f2a7d 100644 --- a/src/main/java/build/buildfarm/worker/Pipeline.java +++ b/src/main/java/build/buildfarm/worker/Pipeline.java @@ -61,9 +61,12 @@ public void close() throws InterruptedException { /** Inform MatchStage to stop matching or picking up new Operations from queue. */ public void stopMatchingOperations() { - for (PipelineStage stage : stageClosePriorities.keySet()) { + for (Map.Entry entry : stageThreads.entrySet()) { + PipelineStage stage = entry.getKey(); if (stage instanceof MatchStage) { - ((MatchStage) stage).prepareForGracefulShutdown(); + MatchStage matchStage = (MatchStage) stage; + matchStage.prepareForGracefulShutdown(); + entry.getValue().interrupt(); return; } } @@ -134,15 +137,17 @@ private void join(boolean closeStage) throws InterruptedException { } } if (stageToClose != null && !stageToClose.isClosed()) { - log.log(Level.FINER, "Closing stage at priority " + maxPriority); + log.log(Level.FINER, "Closing stage " + stageToClose + " at priority " + maxPriority); stageToClose.close(); } } + boolean longStageWait = !closeStage; for (Map.Entry stageThread : stageThreads.entrySet()) { PipelineStage stage = stageThread.getKey(); Thread thread = stageThread.getValue(); try { - thread.join(closeStage ? 1 : 1000); + // 0 is wait forever, no instant wait + thread.join(longStageWait ? 
1000 : 1); } catch (InterruptedException e) { if (!closeStage) { synchronized (this) { @@ -172,8 +177,8 @@ private void join(boolean closeStage) throws InterruptedException { + stageClosePriorities.get(stage)); thread.interrupt(); } + longStageWait = false; } - closeStage = false; for (PipelineStage stage : inactiveStages) { synchronized (this) { stageThreads.remove(stage); From 79cf64898a11c745834d9eabd6707f64e7acab4c Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 14:52:53 -0400 Subject: [PATCH 20/58] Specify example config with grpc target Distinguish target param with GRPC type storage from FILESYSTEM definition --- examples/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/config.yml b/examples/config.yml index f426e98547..901e8ff76d 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -110,7 +110,8 @@ worker: skipLoad: false hexBucketLevels: 0 execRootCopyFallback: false - target: + #- type: GRPC + # target: "grpc://host:port" executeStageWidth: 1 inputFetchStageWidth: 1 inputFetchDeadline: 60 From d73628b6a6dcaea90f93426ad4d9c203851c9120 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Tue, 3 Oct 2023 16:24:53 -0400 Subject: [PATCH 21/58] Remove SpringBoot usage Reinstate prior usage of LoggingMain for safe shutdown, with added release mechanism for interrupted processes. All invoked shutdowns are graceful, with vastly improved shutdown speed for empty workers waiting for pipeline stages. --- defs.bzl | 20 +-- .../build/buildfarm/common/LoggingMain.java | 28 ++-- src/main/java/build/buildfarm/server/BUILD | 6 - .../buildfarm/server/BuildFarmServer.java | 126 +++++++++------- .../java/build/buildfarm/worker/shard/BUILD | 5 - .../shard/ShutDownWorkerGracefully.java | 3 +- .../build/buildfarm/worker/shard/Worker.java | 139 ++++++++---------- 7 files changed, 145 insertions(+), 182 deletions(-) diff --git a/defs.bzl b/defs.bzl index 993e9d36f4..ec51856484 100644 --- a/defs.bzl +++ b/defs.bzl @@ -43,22 +43,8 @@ IO_GRPC_MODULES = [ ] COM_AWS_MODULES = [ - "secretsmanager", "s3", -] - -ORG_SPRING_MODULES = [ - "spring-beans", - "spring-core", - "spring-context", - "spring-web", -] - -ORG_SPRING_BOOT_MODULES = [ - "spring-boot-autoconfigure", - "spring-boot", - "spring-boot-starter-web", - "spring-boot-starter-thymeleaf", + "secretsmanager", ] def buildfarm_init(name = "buildfarm"): @@ -99,7 +85,6 @@ def buildfarm_init(name = "buildfarm"): "org.slf4j:slf4j-simple:1.7.35", "com.googlecode.json-simple:json-simple:1.1.1", "com.jayway.jsonpath:json-path:2.4.0", - "io.github.lognet:grpc-spring-boot-starter:4.5.4", "org.bouncycastle:bcprov-jdk15on:1.70", "net.jcip:jcip-annotations:1.0", ] + ["io.netty:netty-%s:4.1.94.Final" % module for module in IO_NETTY_MODULES] + @@ -122,9 +107,6 @@ def buildfarm_init(name = "buildfarm"): "org.openjdk.jmh:jmh-core:1.23", "org.openjdk.jmh:jmh-generator-annprocess:1.23", "org.redisson:redisson:3.13.1", - ] + ["org.springframework.boot:%s:2.7.4" % module for module in ORG_SPRING_BOOT_MODULES] + - ["org.springframework:%s:5.3.23" % module for module in ORG_SPRING_MODULES] + - [ "org.threeten:threetenbp:1.3.3", "org.xerial:sqlite-jdbc:3.34.0", "org.jetbrains:annotations:16.0.2", diff --git a/src/main/java/build/buildfarm/common/LoggingMain.java b/src/main/java/build/buildfarm/common/LoggingMain.java index 9a4d9bd2b0..4e60a6286a 100644 --- a/src/main/java/build/buildfarm/common/LoggingMain.java +++ b/src/main/java/build/buildfarm/common/LoggingMain.java @@ -7,24 +7,22 @@ public abstract class 
LoggingMain { protected abstract void onShutdown() throws InterruptedException; - class ShutdownThread extends Thread { - ShutdownThread(String applicationName) { - super(null, null, applicationName + "-Shutdown", 0); - } - - @Override - public void run() { - try { - LoggingMain.this.onShutdown(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } finally { - WaitingLogManager.release(); - } + private void shutdown() { + try { + onShutdown(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + WaitingLogManager.release(); } } protected LoggingMain(String applicationName) { - Runtime.getRuntime().addShutdownHook(new ShutdownThread(applicationName)); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + /* group=*/ null, + /* target=*/ this::shutdown, + /* name=*/ applicationName + "-Shutdown")); } } diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index 0d6d396dc8..eccbf4fda2 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -25,11 +25,5 @@ java_library( "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_bouncycastle_bcprov_jdk15on", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_thymeleaf", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", ], ) diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index 4ca669d5f1..c8ae55c83e 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -15,12 +15,13 @@ package build.buildfarm.server; import static build.buildfarm.common.io.Utils.formatIOError; -import static com.google.common.base.Preconditions.checkState; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; import static java.util.logging.Level.SEVERE; +import static java.util.logging.Level.WARNING; import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.GrpcMetrics; import build.buildfarm.common.grpc.TracingMetadataUtils.ServerHeadersInterceptor; @@ -36,7 +37,6 @@ import build.buildfarm.server.services.OperationQueueService; import build.buildfarm.server.services.OperationsService; import build.buildfarm.server.services.PublishBuildEventService; -import com.google.devtools.common.options.OptionsParsingException; import io.grpc.ServerBuilder; import io.grpc.ServerInterceptor; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; @@ -47,24 +47,16 @@ import java.io.File; import java.io.IOException; import java.security.Security; -import java.util.HashMap; -import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; +import java.util.concurrent.atomic.AtomicBoolean; import javax.naming.ConfigurationException; import lombok.extern.java.Log; import org.bouncycastle.jce.provider.BouncyCastleProvider; -import 
org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.annotation.ComponentScan; @SuppressWarnings("deprecation") @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class BuildFarmServer { +public class BuildFarmServer extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -78,8 +70,13 @@ public class BuildFarmServer { private Instance instance; private HealthStatusManager healthStatusManager; private io.grpc.Server server; - private boolean stopping = false; private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); + private AtomicBoolean shutdownInitiated = new AtomicBoolean(true); + private AtomicBoolean released = new AtomicBoolean(true); + + BuildFarmServer() { + super("BuildFarmServer"); + } private ShardInstance createInstance() throws IOException, ConfigurationException, InterruptedException { @@ -87,11 +84,13 @@ private ShardInstance createInstance() configs.getServer().getName(), configs.getServer().getSession() + "-" + configs.getServer().getName(), new DigestUtil(configs.getDigestFunction()), - this::stop); + this::initiateShutdown); } public synchronized void start(ServerBuilder serverBuilder, String publicName) throws IOException, ConfigurationException, InterruptedException { + shutdownInitiated.set(false); + released.set(false); instance = createInstance(); healthStatusManager = new HealthStatusManager(); @@ -138,7 +137,6 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName log.info(String.format("%s initialized", configs.getServer().getSession())); - checkState(!stopping, "must not call start after stop"); instance.start(publicName); server.start(); @@ -148,38 +146,69 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName healthCheckMetric.labels("start").inc(); } - @PreDestroy - public void stop() { - synchronized (this) { - if (stopping) { - return; - } - stopping = true; + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); } - System.err.println("*** shutting down gRPC server since JVM is shutting down"); + } + + synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); + } + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); healthStatusManager.setStatus( HealthStatusManager.SERVICE_NAME_ALL_SERVICES, ServingStatus.NOT_SERVING); PrometheusPublisher.stopHttpServer(); healthCheckMetric.labels("stop").inc(); try { - if (server != null) { - server.shutdown(); - } + initiateShutdown(); instance.stop(); - server.awaitTermination(10, TimeUnit.SECONDS); + if (server != null && server.awaitTermination(10, TimeUnit.SECONDS)) { + server = null; + } } catch (InterruptedException e) { if (server != null) { server.shutdownNow(); + server = null; } + throw e; } if (!shutdownAndAwaitTermination(keepaliveScheduler, 10, TimeUnit.SECONDS)) { log.warning("could not shut down keepalive scheduler"); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); + } + + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); } - @PostConstruct - public void init() throws OptionsParsingException { + 
private void initiateShutdown() { + shutdownInitiated.set(true); + if (server != null) { + server.shutdown(); + } + } + + private void awaitTermination() throws InterruptedException { + while (!shutdownInitiated.get()) { + if (server != null && server.awaitTermination(1, TimeUnit.SECONDS)) { + server = null; + shutdownInitiated.set(true); + } + } + } + + public static void main(String[] args) throws Exception { // Only log severe log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -187,35 +216,24 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); - try { - start( - ServerBuilder.forPort(configs.getServer().getPort()), - configs.getServer().getPublicName()); - } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); - } catch (InterruptedException e) { - System.err.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); - } - } - - public static void main(String[] args) throws ConfigurationException { configs = BuildfarmConfigs.loadServerConfigs(args); // Configure Spring - SpringApplication app = new SpringApplication(BuildFarmServer.class); - Map springConfig = new HashMap<>(); - - // Disable Logback - System.setProperty("org.springframework.boot.logging.LoggingSystem", "none"); - - app.setDefaultProperties(springConfig); + BuildFarmServer server = new BuildFarmServer(); try { - app.run(args); - } catch (Throwable t) { - log.log(SEVERE, "Error running application", t); + server.start( + ServerBuilder.forPort(configs.getServer().getPort()), + configs.getServer().getPublicName()); + server.awaitTermination(); + } catch (IOException e) { + log.severe("error: " + formatIOError(e)); + } catch (InterruptedException e) { + log.log(WARNING, "interrupted", e); + } catch (Exception e) { + log.log(SEVERE, "Error running application", e); + } finally { + server.stop(); } } } diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index 120365882b..ec48ec5be0 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -36,10 +36,5 @@ java_library( "@maven//:io_prometheus_simpleclient", "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", ], ) diff --git a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java index 711c3cf0d3..7d394c9731 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java +++ b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java @@ -18,7 +18,6 @@ import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequestResults; import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.stub.StreamObserver; -import java.util.concurrent.CompletableFuture; import lombok.extern.java.Log; @Log @@ -41,7 +40,7 @@ public void prepareWorkerForGracefulShutdown( PrepareWorkerForGracefulShutDownRequest request, StreamObserver responseObserver) { try { - 
CompletableFuture.runAsync(worker::prepareWorkerForGracefulShutdown); + worker.initiateShutdown(); responseObserver.onNext(PrepareWorkerForGracefulShutDownRequestResults.newBuilder().build()); responseObserver.onCompleted(); } catch (Exception e) { diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 9a919d9703..084e5df7bf 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -35,6 +35,7 @@ import build.buildfarm.common.BuildfarmExecutors; import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Cas; import build.buildfarm.common.config.GrpcMetrics; @@ -60,7 +61,6 @@ import build.buildfarm.worker.SuperscalarPipelineStage; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; -import com.google.devtools.common.options.OptionsParsingException; import com.google.longrunning.Operation; import com.google.protobuf.ByteString; import com.google.protobuf.Duration; @@ -85,20 +85,14 @@ import java.util.UUID; import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import javax.annotation.Nullable; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; import javax.naming.ConfigurationException; import lombok.extern.java.Log; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.annotation.ComponentScan; @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class Worker { +public final class Worker extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -139,53 +133,10 @@ public class Worker { private Pipeline pipeline; private Backplane backplane; private LoadingCache workerStubs; + private AtomicBoolean released = new AtomicBoolean(true); - /** - * The method will prepare the worker for graceful shutdown when the worker is ready. Note on - * using stderr here instead of log. By the time this is called in PreDestroy, the log is no - * longer available and is not logging messages. - */ - public void prepareWorkerForGracefulShutdown() { - if (configs.getWorker().getGracefulShutdownSeconds() == 0) { - System.err.println( - String.format( - "Graceful Shutdown is not enabled. 
Worker is shutting down without finishing executions in progress.")); - } else { - inGracefulShutdown = true; - System.err.println( - "Graceful Shutdown - The current worker will not be registered again and should be shutdown gracefully!"); - pipeline.stopMatchingOperations(); - int scanRate = 30; // check every 30 seconds - int timeWaited = 0; - int timeOut = configs.getWorker().getGracefulShutdownSeconds(); - - try { - if (pipeline.isEmpty()) { - System.err.println("Graceful Shutdown - no work in the pipeline."); - } else { - System.err.println( - String.format("Graceful Shutdown - waiting for executions to finish.")); - } - while (!pipeline.isEmpty() && timeWaited < timeOut) { - SECONDS.sleep(scanRate); - timeWaited += scanRate; - System.err.println( - String.format( - "Graceful Shutdown - Pipeline is still not empty after %d seconds.", timeWaited)); - } - } catch (InterruptedException e) { - System.err.println( - "Graceful Shutdown - The worker gracefully shutdown is interrupted: " + e.getMessage()); - } finally { - System.err.println( - String.format( - "Graceful Shutdown - It took the worker %d seconds to %s", - timeWaited, - pipeline.isEmpty() - ? "finish all actions" - : "gracefully shutdown but still cannot finish all actions")); - } - } + private Worker() { + super("BuildFarmShardWorker"); } private Operation stripOperation(Operation operation) { @@ -508,13 +459,6 @@ public void run() { } } catch (InterruptedException e) { // ignore - } finally { - try { - stop(); - } catch (InterruptedException ie) { - log.log(SEVERE, "interrupted while stopping worker", ie); - // ignore - } } } }, @@ -534,6 +478,7 @@ private long loadWorkerStartTimeInMillis() { } public void start() throws ConfigurationException, InterruptedException, IOException { + released.set(false); String session = UUID.randomUUID().toString(); ServerBuilder serverBuilder = ServerBuilder.forPort(configs.getWorker().getPort()); String identifier = "buildfarm-worker-" + configs.getWorker().getPublicName() + "-" + session; @@ -642,10 +587,41 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep log.log(INFO, String.format("%s initialized", identifier)); } - @PreDestroy - public void stop() throws InterruptedException { - System.err.println("*** shutting down gRPC server since JVM is shutting down"); - prepareWorkerForGracefulShutdown(); + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); + } + + private void awaitTermination() throws InterruptedException { + pipeline.join(); + server.awaitTermination(); + } + + public void initiateShutdown() { + pipeline.stopMatchingOperations(); + if (server != null) { + server.shutdown(); + } + } + + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); + } + } + + public synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); + } + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); PrometheusPublisher.stopHttpServer(); boolean interrupted = Thread.interrupted(); if (pipeline != null) { @@ -663,11 +639,12 @@ public void stop() throws InterruptedException { executionSlotsTotal.set(0); inputFetchSlotsTotal.set(0); if (execFileSystem != null) { - log.log(INFO, "Stopping exec filesystem"); + log.info("Stopping exec filesystem"); execFileSystem.stop(); + execFileSystem = null; } if (server != null) { - 
log.log(INFO, "Shutting down the server"); + log.info("Shutting down the server"); server.shutdown(); try { @@ -678,26 +655,28 @@ public void stop() throws InterruptedException { } finally { server.shutdownNow(); } + server = null; } if (backplane != null) { try { backplane.stop(); + backplane = null; } catch (InterruptedException e) { interrupted = true; } } if (workerStubs != null) { workerStubs.invalidateAll(); + workerStubs = null; } if (interrupted) { Thread.currentThread().interrupt(); throw new InterruptedException(); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); } - @PostConstruct - public void init() throws OptionsParsingException { + public static void main(String[] args) throws Exception { // Only log severe log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -705,19 +684,17 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); + configs = BuildfarmConfigs.loadWorkerConfigs(args); + Worker worker = new Worker(); try { - start(); + worker.start(); + worker.awaitTermination(); } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); + log.severe(formatIOError(e)); } catch (InterruptedException e) { - System.out.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); + log.log(Level.WARNING, "interrupted", e); + } finally { + worker.stop(); } } - - public static void main(String[] args) throws ConfigurationException { - configs = BuildfarmConfigs.loadWorkerConfigs(args); - SpringApplication.run(Worker.class, args); - } } From 6f88e3424c3b2bb20e381a7358298698c72da559 Mon Sep 17 00:00:00 2001 From: Yuriy Belenitsky Date: Tue, 10 Oct 2023 11:06:01 -0400 Subject: [PATCH 22/58] Enable graceful shutdown for server (#1490) --- _site/docs/configuration/configuration.md | 48 ++++++++++--------- examples/config.yml | 1 + .../build/buildfarm/common/config/Server.java | 1 + .../buildfarm/server/BuildFarmServer.java | 32 +++++++++++++ 4 files changed, 59 insertions(+), 23 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index cbd286587f..ac74bb3f1e 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -51,29 +51,31 @@ worker: ### Server -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|----------------------------------|-------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) | -| name | String, _shard_ | | Implementation name | -| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | -| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache | -| port | Integer, _8980_ | | Listening port of the GRPC server | -| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) | -| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) | -| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) | -| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | -| runDispatchedMonitor 
| boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued |
-| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) |
-| runOperationQueuer | boolean, _true_ | | Aquire execute request entries cooperatively from an arrival queue on the backplane |
-| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. If any outputs are missing a cache miss is returned |
-| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) |
-| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation |
-| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) |
-| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) |
-| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) |
-| recordBesEvents | boolean, _false_ | | Allow recording of BES events |
-| clusterId | String, _local_ | | Buildfarm cluster ID |
-| cloudRegion | String, _us-east_1_ | | Deployment region in the cloud |
+| Configuration | Accepted and _Default_ Values | Environment Var | Description |
+|----------------------------------|-------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------|
+| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) |
+| name | String, _shard_ | | Implementation name |
+| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers |
+| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache |
+| port | Integer, _8980_ | | Listening port of the GRPC server |
+| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) |
+| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) |
+| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) |
+| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) |
+| runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued |
+| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) |
+| runOperationQueuer | boolean, _true_ | | Acquire execute request entries cooperatively from an arrival queue on the backplane |
+| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS.
If any outputs are missing a cache miss is returned |
+| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) |
+| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation |
+| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) |
+| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) |
+| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) |
+| recordBesEvents | boolean, _false_ | | Allow recording of BES events |
+| clusterId | String, _local_ | | Buildfarm cluster ID |
+| cloudRegion | String, _us-east_1_ | | Deployment region in the cloud |
+| gracefulShutdownSeconds | Integer, _0_ | | Time in seconds to allow for connections in flight to finish when shutdown signal is received |
+
Example:

diff --git a/examples/config.yml b/examples/config.yml
index 901e8ff76d..cdf72f3de7 100644
--- a/examples/config.yml
+++ b/examples/config.yml
@@ -31,6 +31,7 @@ server:
recordBesEvents: false
clusterId: local
cloudRegion: us-east-1
+  gracefulShutdownSeconds: 0
caches:
directoryCacheMaxEntries: 10000
commandCacheMaxEntries: 10000
diff --git a/src/main/java/build/buildfarm/common/config/Server.java b/src/main/java/build/buildfarm/common/config/Server.java
index 9f2899ff60..0487154c2f 100644
--- a/src/main/java/build/buildfarm/common/config/Server.java
+++ b/src/main/java/build/buildfarm/common/config/Server.java
@@ -42,6 +42,7 @@ public enum INSTANCE_TYPE {
private int maxInboundMetadataSize = 0;
private ServerCacheConfigs caches = new ServerCacheConfigs();
private boolean findMissingBlobsViaBackplane = false;
+  private int gracefulShutdownSeconds = 0;
public String getSession() {
return String.format("buildfarm-server-%s-%s", getPublicName(), sessionGuid);
}
diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java
index c8ae55c83e..a20f4a6f77 100644
--- a/src/main/java/build/buildfarm/server/BuildFarmServer.java
+++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java
@@ -17,6 +17,7 @@
import static build.buildfarm.common.io.Utils.formatIOError;
import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination;
import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
+import static java.util.concurrent.TimeUnit.SECONDS;
import static java.util.logging.Level.SEVERE;
import static java.util.logging.Level.WARNING;
@@ -78,6 +79,36 @@ public class BuildFarmServer extends LoggingMain {
super("BuildFarmServer");
}
+  /**
+   * The method will prepare the server for graceful shutdown when the server is ready. The current
+   * implementation waits a specified period of time. Future improvements could be to keep track of
+   * open connections and shut down when there are none left. Note on using stderr here instead of
+   * log. By the time this is called in PreDestroy, the log is no longer available and is not
+   * logging messages.
+   */
+  public void prepareServerForGracefulShutdown() {
+    if (configs.getServer().getGracefulShutdownSeconds() == 0) {
+      System.err.println(
+          String.format("Graceful Shutdown is not enabled.
Server is shutting down immediately."));
+    } else {
+      try {
+        System.err.println(
+            String.format(
+                "Graceful Shutdown - Waiting %d seconds to allow connections to drain.",
+                configs.getServer().getGracefulShutdownSeconds()));
+        SECONDS.sleep(configs.getServer().getGracefulShutdownSeconds());
+      } catch (InterruptedException e) {
+        System.err.println(
+            "Graceful Shutdown - The graceful shutdown was interrupted: " + e.getMessage());
+      } finally {
+        System.err.println(
+            String.format(
+                "Graceful Shutdown - It took the server %d seconds to shut down",
+                configs.getServer().getGracefulShutdownSeconds()));
+      }
+    }
+  }
+
   private ShardInstance createInstance()
       throws IOException, ConfigurationException, InterruptedException {
     return new ShardInstance(
@@ -163,6 +194,7 @@ synchronized void stop() throws InterruptedException {

   private void shutdown() throws InterruptedException {
     log.info("*** shutting down gRPC server since JVM is shutting down");
+    prepareServerForGracefulShutdown();
     healthStatusManager.setStatus(
         HealthStatusManager.SERVICE_NAME_ALL_SERVICES, ServingStatus.NOT_SERVING);
     PrometheusPublisher.stopHttpServer();

From 81122e7fa455afb1267380cab8658ebedbd8b845 Mon Sep 17 00:00:00 2001
From: Jason Schroeder
Date: Fri, 6 Oct 2023 16:00:03 -0700
Subject: [PATCH 23/58] refactor: code cleanup

Tiny code cleanup
---
 .../buildfarm/instance/shard/JedisClusterFactory.java | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java
index 5f85da29f6..2bfb48ac9e 100644
--- a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java
+++ b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java
@@ -58,9 +58,7 @@ public static Supplier<JedisCluster> create(String identifier) throws Configurat
           list2Set(redisNodes),
           configs.getBackplane().getTimeout(),
           configs.getBackplane().getMaxAttempts(),
-          Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword())
-              ? null
-              : configs.getBackplane().getRedisPassword(),
+          Strings.emptyToNull(configs.getBackplane().getRedisPassword()),
           createJedisPoolConfig());
   }

@@ -70,9 +68,7 @@ public static Supplier<JedisCluster> create(String identifier) throws Configurat
           parseUri(configs.getBackplane().getRedisUri()),
           configs.getBackplane().getTimeout(),
           configs.getBackplane().getMaxAttempts(),
-          Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword())
-              ? null
-              : configs.getBackplane().getRedisPassword(),
+          Strings.emptyToNull(configs.getBackplane().getRedisPassword()),
           createJedisPoolConfig());
   }

From ad44854942cab540ebf8c208e16bfe8ecc225c8f Mon Sep 17 00:00:00 2001
From: George Gensure
Date: Thu, 22 Jun 2023 09:20:08 -0400
Subject: [PATCH 24/58] Log paths created on putDirectory

Will include the operation root and inform directory cache effectiveness.
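For orientation, the diff below pairs the cached directory path with a `missed` flag. A minimal sketch of a hypothetical caller that uses the flag to log only directories that actually had to be created — the observer class itself is not part of this patch, only `putDirectory` and `PathResult` are:

```java
import build.bazel.remote.execution.v2.Digest;
import build.bazel.remote.execution.v2.Directory;
import build.buildfarm.cas.cfc.CASFileCache;
import build.buildfarm.cas.cfc.CASFileCache.PathResult;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.logging.Logger;

class PutDirectoryObserver {
  private static final Logger log = Logger.getLogger("PutDirectoryObserver");

  static void observe(
      CASFileCache fileCache,
      Digest digest,
      Map<Digest, Directory> directoriesIndex,
      ExecutorService service) {
    Futures.addCallback(
        fileCache.putDirectory(digest, directoriesIndex, service),
        new FutureCallback<PathResult>() {
          @Override
          public void onSuccess(PathResult result) {
            if (result.getMissed()) {
              // a miss means the directory was not already cached and had to be created
              log.fine("created directory " + result.getPath());
            }
          }

          @Override
          public void onFailure(Throwable t) {
            log.warning("putDirectory failed: " + t.getMessage());
          }
        },
        service);
  }
}
```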
---
 .../build/buildfarm/cas/cfc/CASFileCache.java | 28 +++++++++++---
 .../worker/shard/CFCExecFileSystem.java       | 37 ++++++++++---------
 .../buildfarm/cas/cfc/CASFileCacheTest.java   |  3 +-
 3 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java
index 6b6e63b4ad..aac1b28994 100644
--- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java
+++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java
@@ -2173,7 +2173,7 @@ private Directory getDirectoryFromDigest(
     return directory;
   }

-  public ListenableFuture<Path> putDirectory(
+  public ListenableFuture<PathResult> putDirectory(
       Digest digest, Map<Digest, Directory> directoriesIndex, ExecutorService service) {
     // Claim lock.
     // Claim the directory path so no other threads try to create/delete it.
@@ -2189,7 +2189,7 @@ public ListenableFuture<Path> putDirectory(
     log.log(Level.FINER, format("locked directory %s", path.getFileName()));

     // Now that a lock has been claimed, we can proceed to create the directory.
-    ListenableFuture<Path> putFuture;
+    ListenableFuture<PathResult> putFuture;
     try {
       putFuture = putDirectorySynchronized(path, digest, directoriesIndex, service);
     } catch (IOException e) {
@@ -2244,8 +2244,26 @@ private boolean directoryEntryExists(
     return false;
   }

+  public static class PathResult {
+    private final Path path;
+    private final boolean missed;
+
+    public PathResult(Path path, boolean missed) {
+      this.path = path;
+      this.missed = missed;
+    }
+
+    public Path getPath() {
+      return path;
+    }
+
+    public boolean getMissed() {
+      return missed;
+    }
+  }
+
   @SuppressWarnings("ConstantConditions")
-  private ListenableFuture<Path> putDirectorySynchronized(
+  private ListenableFuture<PathResult> putDirectorySynchronized(
       Path path, Digest digest, Map<Digest, Directory> directoriesByDigest, ExecutorService service)
       throws IOException {
     log.log(Level.FINER, format("directory %s has been locked", path.getFileName()));
@@ -2277,7 +2295,7 @@ private ListenableFuture<Path> putDirectorySynchronized(
     if (e != null) {
       log.log(Level.FINER, format("found existing entry for %s", path.getFileName()));
       if (directoryEntryExists(path, e, directoriesByDigest)) {
-        return immediateFuture(path);
+        return immediateFuture(new PathResult(path, /* missed=*/ false));
       }
       log.log(
           Level.SEVERE,
@@ -2410,7 +2428,7 @@ private ListenableFuture<Path> putDirectorySynchronized(
               : directoriesByDigest.get(digest),
               Deadline.after(10, SECONDS));
           directoryStorage.put(digest, e);
-          return path;
+          return new PathResult(path, /* missed=*/ true);
         },
         service);
   }
diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
index e6d9826f45..74b238b2c6 100644
--- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
+++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
@@ -265,6 +265,7 @@ private Iterable<ListenableFuture<Void>> fetchInputs(
             .map(symlinkNode -> putSymlink(path, symlinkNode))
             .collect(ImmutableList.toImmutableList()));

+    ImmutableList.Builder<ListenableFuture<Void>> linkedDirectories = ImmutableList.builder();
     for (DirectoryNode directoryNode : directory.getDirectoriesList()) {
       Digest digest = directoryNode.getDigest();
       String name = directoryNode.getName();
@@ -285,26 +286,23 @@ private Iterable<ListenableFuture<Void>> fetchInputs(
                   onKey,
                   inputDirectories));
         } else {
-          downloads =
-              concat(
-                  downloads,
-                  ImmutableList.of(
-                      transform(
-                          linkDirectory(dirPath, digest, directoriesIndex),
-                          (result) -> {
-                            // we saw null entries in the built immutable list without synchronization
-                            synchronized
(inputDirectories) { - inputDirectories.add(digest); - } - return null; - }, - fetchService))); + linkedDirectories.add( + transform( + linkDirectory(dirPath, digest, directoriesIndex), + (result) -> { + // we saw null entries in the built immutable list without synchronization + synchronized (inputDirectories) { + inputDirectories.add(digest); + } + return null; + }, + fetchService)); } if (Thread.currentThread().isInterrupted()) { break; } } - return downloads; + return concat(downloads, linkedDirectories.build()); } @SuppressWarnings("ConstantConditions") @@ -312,8 +310,13 @@ private ListenableFuture linkDirectory( Path execPath, Digest digest, Map directoriesIndex) { return transformAsync( fileCache.putDirectory(digest, directoriesIndex, fetchService), - (cachePath) -> { - Files.createSymbolicLink(execPath, cachePath); + pathResult -> { + Path path = pathResult.getPath(); + if (pathResult.getMissed()) { + log.fine( + String.format("putDirectory(%s, %s) created", path, DigestUtil.toString(digest))); + } + Files.createSymbolicLink(execPath, path); return immediateFuture(null); }, fetchService); diff --git a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java index 72e167b4df..a3d406e232 100644 --- a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java @@ -240,7 +240,8 @@ public void putDirectoryCreatesTree() throws IOException, InterruptedException { subdirDigest, subDirectory); Path dirPath = getInterruptiblyOrIOException( - fileCache.putDirectory(dirDigest, directoriesIndex, putService)); + fileCache.putDirectory(dirDigest, directoriesIndex, putService)) + .getPath(); assertThat(Files.isDirectory(dirPath)).isTrue(); assertThat(Files.exists(dirPath.resolve("file"))).isTrue(); assertThat(Files.isDirectory(dirPath.resolve("subdir"))).isTrue(); From b31ffedb5528c0f0fb2930473f802641f390a3d2 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Fri, 7 Jul 2023 01:49:58 -0400 Subject: [PATCH 25/58] Permit regex realInputDirectories Selecting realInputDirectories by regex permits flexible patterns that can yield drastic improvements in directory reuse for specialized deployments. runfiles in particular are hazardous expansions of nearly-execroot in the case of bazel. Care must be taken to match directories exclusively. The entire input tree is traversed for matches against expanded paths under the root, to allow for nested selection. Each match thus costs the number of input directories. Counterintuitively, OutputFiles are augmented to avoid the recursive check for OutputDirectories which only applies to actual reported results, resulting in a path match when creating the exec root. Regex style is java.util.Pattern, and must match the full input directory. 
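Since each pattern must cover the full input directory, here is a small, self-contained illustration of `java.util.regex.Pattern` full-match semantics against hypothetical root-relative paths (not part of the patch):

```java
import java.util.regex.Pattern;

class PatternMatchDemo {
  public static void main(String[] args) {
    // matches() anchors at both ends, so a pattern must cover the entire
    // root-relative path of a candidate input directory.
    Pattern external = Pattern.compile("external");
    System.out.println(external.matcher("external").matches());     // true
    System.out.println(external.matcher("foo/external").matches()); // false: not the full path
    System.out.println(external.matcher("externals").matches());    // false: no partial matches

    // selecting nested directories requires a pattern that spans the full path
    Pattern runfiles = Pattern.compile(".*\\.runfiles");
    System.out.println(
        runfiles.matcher("bazel-out/k8-fastbuild/bin/tool.runfiles").matches()); // true
  }
}
```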
--- .../worker/shard/CFCExecFileSystem.java | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index 74b238b2c6..70a4acbb04 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -19,7 +19,6 @@ import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.Iterables.concat; -import static com.google.common.collect.Iterables.filter; import static com.google.common.util.concurrent.Futures.allAsList; import static com.google.common.util.concurrent.Futures.catchingAsync; import static com.google.common.util.concurrent.Futures.immediateFailedFuture; @@ -48,7 +47,7 @@ import build.buildfarm.worker.ExecDirException.ViolationException; import build.buildfarm.worker.OutputDirectory; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; +import com.google.common.collect.Iterables; import com.google.common.util.concurrent.ListenableFuture; import java.io.IOException; import java.io.InputStream; @@ -56,14 +55,16 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.UserPrincipal; -import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Stack; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.function.Consumer; import java.util.logging.Level; +import java.util.regex.Pattern; import javax.annotation.Nullable; import lombok.extern.java.Log; @@ -77,7 +78,7 @@ class CFCExecFileSystem implements ExecFileSystem { private final boolean linkInputDirectories; // override the symlinking above for a set of matching paths - private final Iterable realInputDirectories; + private final Iterable realInputDirectories; private final Map> rootInputFiles = new ConcurrentHashMap<>(); private final Map> rootInputDirectories = new ConcurrentHashMap<>(); @@ -98,7 +99,9 @@ class CFCExecFileSystem implements ExecFileSystem { this.fileCache = fileCache; this.owner = owner; this.linkInputDirectories = linkInputDirectories; - this.realInputDirectories = realInputDirectories; + this.realInputDirectories = + Iterables.transform( + realInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); this.removeDirectoryService = removeDirectoryService; this.accessRecorder = accessRecorder; } @@ -328,28 +331,61 @@ private static void checkExecErrors(Path path, List errors) throws Ex } } - private static boolean treeContainsPath( - String directoryPath, Map directoriesIndex, Digest rootDigest) { - Directory directory = directoriesIndex.get(rootDigest); - for (String name : directoryPath.split("/")) { - List subdirs = directory.getDirectoriesList(); - int index = Collections.binarySearch(Lists.transform(subdirs, DirectoryNode::getName), name); - if (index < 0) { - return false; + private static Iterator directoriesIterator( + Digest digest, Map directoriesIndex) { + Directory root = directoriesIndex.get(digest); + return new Iterator() { + boolean atEnd = root.getDirectoriesCount() == 0; + Stack path = new Stack<>(); + Stack> route = new Stack<>(); + Iterator current = root.getDirectoriesList().iterator(); + + 
@Override + public boolean hasNext() { + return !atEnd; } - directory = directoriesIndex.get(subdirs.get(index).getDigest()); - } - return true; + + @Override + public String next() { + String nextPath; + DirectoryNode next = current.next(); + String name = next.getName(); + path.push(name); + nextPath = String.join("/", path); + Digest digest = next.getDigest(); + if (digest.getSizeBytes() != 0) { + route.push(current); + current = directoriesIndex.get(digest).getDirectoriesList().iterator(); + } else { + path.pop(); + } + while (!current.hasNext() && !route.isEmpty()) { + current = route.pop(); + path.pop(); + } + atEnd = !current.hasNext(); + return nextPath; + } + }; } private Iterable realDirectories( Map directoriesIndex, Digest rootDigest) { // skip this search if all the directories are real if (linkInputDirectories) { - // somewhat inefficient, but would need many overrides to be painful - return filter( - realInputDirectories, - realInputDirectory -> treeContainsPath(realInputDirectory, directoriesIndex, rootDigest)); + ImmutableList.Builder builder = ImmutableList.builder(); + + Iterator dirs = directoriesIterator(rootDigest, directoriesIndex); + while (dirs.hasNext()) { + String dir = dirs.next(); + for (Pattern pattern : realInputDirectories) { + if (pattern.matcher(dir).matches()) { + builder.add(dir); + } + } + } + + return builder.build(); } return ImmutableList.of(); } @@ -361,10 +397,9 @@ public Path createExecDir( Digest inputRootDigest = action.getInputRootDigest(); OutputDirectory outputDirectory = OutputDirectory.parse( - command.getOutputFilesList(), concat( - command.getOutputDirectoriesList(), - realDirectories(directoriesIndex, inputRootDigest)), + command.getOutputFilesList(), realDirectories(directoriesIndex, inputRootDigest)), + command.getOutputDirectoriesList(), command.getEnvironmentVariablesList()); Path execDir = root.resolve(operationName); From 1dec6848e4141c714b0f854a94fbee3693da2a37 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Fri, 7 Jul 2023 02:08:18 -0400 Subject: [PATCH 26/58] Log execPath rather than the cache dir path This will include the path to the missed directory and the operation which required it. 
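For reference, the `directoriesIterator` introduced above presents every directory of the input tree as a slash-joined relative path, parents before children. A recursive toy analogue over an assumed string-keyed tree — the patch itself uses an explicit `Stack` to the same effect:

```java
import java.util.List;
import java.util.Map;

class TreeWalk {
  // Emits each directory path exactly once, depth-first, parents first —
  // the same order in which directoriesIterator presents match candidates.
  static void walk(String prefix, Map<String, List<String>> children, List<String> out) {
    for (String name : children.getOrDefault(prefix, List.of())) {
      String path = prefix.isEmpty() ? name : prefix + "/" + name;
      out.add(path);
      walk(path, children, out);
    }
  }
}
```

Called with a tree like `{"" -> [external, src], "src" -> [main]}`, the walk yields `external`, `src`, `src/main`; every yielded path is tested against every configured pattern, which is why each match costs on the order of the number of input directories.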
---
 .../java/build/buildfarm/worker/shard/CFCExecFileSystem.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
index 70a4acbb04..2944f30d0d 100644
--- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
+++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
@@ -317,7 +317,8 @@ private ListenableFuture<Void> linkDirectory(
         Path path = pathResult.getPath();
         if (pathResult.getMissed()) {
           log.fine(
-              String.format("putDirectory(%s, %s) created", path, DigestUtil.toString(digest)));
+              String.format(
+                  "putDirectory(%s, %s) created", execPath, DigestUtil.toString(digest)));
         }
         Files.createSymbolicLink(execPath, path);
         return immediateFuture(null);

From 6af857ad318180ee9bd574429990e774d780f1c7 Mon Sep 17 00:00:00 2001
From: George Gensure
Date: Fri, 7 Jul 2023 11:14:30 -0400
Subject: [PATCH 27/58] Shore up OutputDirectory for silence on duplicates

Prevent adding duplicate realInputDirectories matches
---
 src/main/java/build/buildfarm/worker/OutputDirectory.java    | 5 +++++
 .../java/build/buildfarm/worker/shard/CFCExecFileSystem.java | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/main/java/build/buildfarm/worker/OutputDirectory.java b/src/main/java/build/buildfarm/worker/OutputDirectory.java
index 6245553803..caf77b273e 100644
--- a/src/main/java/build/buildfarm/worker/OutputDirectory.java
+++ b/src/main/java/build/buildfarm/worker/OutputDirectory.java
@@ -161,10 +161,15 @@ private static OutputDirectory parseDirectories(Iterable<OutputDirectoryEntry> o
     Iterables.addAll(sortedOutputDirs, outputDirs);
     Collections.sort(sortedOutputDirs);

+    String currentOutputDir = "";
     Builder currentBuilder = builder;
     String prefix = "/";
     for (OutputDirectoryEntry entry : sortedOutputDirs) {
       String outputDir = entry.outputDirectory;
+      if (outputDir.equals(currentOutputDir)) {
+        continue;
+      }
+      currentOutputDir = outputDir;
       while (!outputDir.startsWith(prefix)) {
         currentBuilder = stack.pop();
         int upPathSeparatorIndex = prefix.lastIndexOf('/', prefix.length() - 2);
diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
index 2944f30d0d..d73b977916 100644
--- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
+++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java
@@ -382,6 +382,7 @@ private Iterable<String> realDirectories(
         for (Pattern pattern : realInputDirectories) {
           if (pattern.matcher(dir).matches()) {
             builder.add(dir);
+            break; // avoid adding the same directory twice
           }
         }
       }

From 71a928c73f7ba69f7533d13821de0d9bf72c36d9 Mon Sep 17 00:00:00 2001
From: George Gensure
Date: Fri, 7 Jul 2023 16:31:06 -0400
Subject: [PATCH 28/58] Trigger realInputDirectories to have empty files

Ensure that the last leg of the execution presents a directory, rather than the parent, per OutputDirectory's stamping.
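The trailing slash added in the next diff matters because of how `OutputDirectory.parse` stamps entries: an output *file* path only implies its parent directories, while a path ending in "/" names the directory itself. A hedged sketch of the call shape (the three-argument form is visible in the surrounding patches; the values here are hypothetical):

```java
import com.google.common.collect.ImmutableList;

// As a plain output file path, "tools/runfiles" would only guarantee its
// parent directory "tools"; with the trailing slash, "tools/runfiles/" makes
// OutputDirectory present the matched directory itself in the exec root.
OutputDirectory outputDirectory =
    OutputDirectory.parse(
        ImmutableList.of("tools/runfiles/"), // output files, incl. linked-directory matches
        ImmutableList.of(),                  // output directories
        ImmutableList.of());                 // environment variables
```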
--- .../java/build/buildfarm/worker/shard/CFCExecFileSystem.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index d73b977916..94c232a942 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -381,7 +381,7 @@ private Iterable realDirectories( String dir = dirs.next(); for (Pattern pattern : realInputDirectories) { if (pattern.matcher(dir).matches()) { - builder.add(dir); + builder.add(dir + "/"); break; // avoid adding the same directory twice } } From f8017bd9aaa513dc0f91d52b18d06e2a6379d410 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Sat, 8 Jul 2023 10:02:43 -0400 Subject: [PATCH 29/58] Switch to positive check for linkInputDirectories --- examples/config.yml | 4 +- .../build/buildfarm/common/config/Worker.java | 2 +- .../worker/shard/CFCExecFileSystem.java | 39 ++++++++++++------- .../build/buildfarm/worker/shard/Worker.java | 2 +- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/examples/config.yml b/examples/config.yml index cdf72f3de7..bce19a3d60 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -117,8 +117,8 @@ worker: inputFetchStageWidth: 1 inputFetchDeadline: 60 linkInputDirectories: true - realInputDirectories: - - "external" + linkedInputDirectories: + - "(?!external)[^/]+" execOwner: defaultMaxCores: 0 limitGlobalExecution: false diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java index 4caaff2b02..1661df9bfe 100644 --- a/src/main/java/build/buildfarm/common/config/Worker.java +++ b/src/main/java/build/buildfarm/common/config/Worker.java @@ -28,7 +28,7 @@ public class Worker { private int inputFetchStageWidth = 0; private int inputFetchDeadline = 60; private boolean linkInputDirectories = true; - private List realInputDirectories = Arrays.asList("external"); + private List linkedInputDirectories = Arrays.asList("(?!external)[^/]+"); private String execOwner; private int defaultMaxCores = 0; private boolean limitGlobalExecution = false; diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index 94c232a942..041868f69c 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -47,6 +47,7 @@ import build.buildfarm.worker.ExecDirException.ViolationException; import build.buildfarm.worker.OutputDirectory; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.util.concurrent.ListenableFuture; import java.io.IOException; @@ -58,6 +59,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.Stack; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; @@ -77,8 +79,8 @@ class CFCExecFileSystem implements ExecFileSystem { // perform first-available non-output symlinking and retain directories in cache private final boolean linkInputDirectories; - // override the symlinking above for a set of matching paths - private final Iterable realInputDirectories; + // indicate symlinking above for a set of matching paths + private final Iterable 
linkedInputDirectories; private final Map> rootInputFiles = new ConcurrentHashMap<>(); private final Map> rootInputDirectories = new ConcurrentHashMap<>(); @@ -92,16 +94,16 @@ class CFCExecFileSystem implements ExecFileSystem { CASFileCache fileCache, @Nullable UserPrincipal owner, boolean linkInputDirectories, - Iterable realInputDirectories, + Iterable linkedInputDirectories, ExecutorService removeDirectoryService, ExecutorService accessRecorder) { this.root = root; this.fileCache = fileCache; this.owner = owner; this.linkInputDirectories = linkInputDirectories; - this.realInputDirectories = + this.linkedInputDirectories = Iterables.transform( - realInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); + linkedInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); this.removeDirectoryService = removeDirectoryService; this.accessRecorder = accessRecorder; } @@ -239,6 +241,7 @@ private Iterable> fetchInputs( Digest directoryDigest, Map directoriesIndex, OutputDirectory outputDirectory, + Set linkedInputDirectories, Consumer onKey, ImmutableList.Builder inputDirectories) throws IOException { @@ -275,7 +278,9 @@ private Iterable> fetchInputs( OutputDirectory childOutputDirectory = outputDirectory != null ? outputDirectory.getChild(name) : null; Path dirPath = path.resolve(name); - if (childOutputDirectory != null || !linkInputDirectories) { + if (childOutputDirectory != null + || !linkInputDirectories + || !linkedInputDirectories.contains(dirPath)) { Files.createDirectories(dirPath); downloads = concat( @@ -286,6 +291,7 @@ private Iterable> fetchInputs( digest, directoriesIndex, childOutputDirectory, + linkedInputDirectories, onKey, inputDirectories)); } else { @@ -370,26 +376,25 @@ public String next() { }; } - private Iterable realDirectories( + private Set linkedDirectories( Map directoriesIndex, Digest rootDigest) { // skip this search if all the directories are real if (linkInputDirectories) { - ImmutableList.Builder builder = ImmutableList.builder(); + ImmutableSet.Builder builder = ImmutableSet.builder(); Iterator dirs = directoriesIterator(rootDigest, directoriesIndex); while (dirs.hasNext()) { String dir = dirs.next(); - for (Pattern pattern : realInputDirectories) { + for (Pattern pattern : linkedInputDirectories) { if (pattern.matcher(dir).matches()) { - builder.add(dir + "/"); + builder.add(dir); break; // avoid adding the same directory twice } } } - return builder.build(); } - return ImmutableList.of(); + return ImmutableSet.of(); } @Override @@ -399,8 +404,7 @@ public Path createExecDir( Digest inputRootDigest = action.getInputRootDigest(); OutputDirectory outputDirectory = OutputDirectory.parse( - concat( - command.getOutputFilesList(), realDirectories(directoriesIndex, inputRootDigest)), + command.getOutputFilesList(), command.getOutputDirectoriesList(), command.getEnvironmentVariablesList()); @@ -413,6 +417,12 @@ public Path createExecDir( ImmutableList.Builder inputFiles = new ImmutableList.Builder<>(); ImmutableList.Builder inputDirectories = new ImmutableList.Builder<>(); + Set linkedInputDirectories = + ImmutableSet.copyOf( + Iterables.transform( + linkedDirectories(directoriesIndex, inputRootDigest), + path -> execDir.resolve(path))); // does this work on windows with / separators? 
+ log.log( Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") calling fetchInputs"); Iterable> fetchedFutures = @@ -422,6 +432,7 @@ public Path createExecDir( inputRootDigest, directoriesIndex, outputDirectory, + linkedInputDirectories, key -> { synchronized (inputFiles) { inputFiles.add(key); diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 084e5df7bf..153df1a371 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -333,7 +333,7 @@ private ExecFileSystem createCFCExecFileSystem( fileCache, owner, configs.getWorker().isLinkInputDirectories(), - configs.getWorker().getRealInputDirectories(), + configs.getWorker().getLinkedInputDirectories(), removeDirectoryService, accessRecorder /* deadlineAfter=*/ From 5d88e544e54d028c6b7af485aba5bac5b32b11c4 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 12 Oct 2023 10:48:23 -0700 Subject: [PATCH 30/58] docs(configuration): document --prometheus_port CLI argument --- _site/docs/configuration/configuration.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index ac74bb3f1e..21956e7b3d 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -28,13 +28,13 @@ For an example config containing all of the configuration values, see `examples/ ### Common -| Configuration | Accepted and _Default_ Values | Description | -|----------------------|-------------------------------|---------------------------------------------------| -| digestFunction | _SHA256_, SHA1 | Digest function for this implementation | -| defaultActionTimeout | Integer, _600_ | Default timeout value for an action (seconds) | -| maximumActionTimeout | Integer, _3600_ | Maximum allowed action timeout (seconds) | -| maxEntrySizeBytes | Long, _2147483648_ | Maximum size of a single blob accepted (bytes) | -| prometheusPort | Integer, _9090_ | Listening port of the Prometheus metrics endpoint | +| Configuration | Accepted and _Default_ Values | Command Line Argument | Description | +|----------------------|-------------------------------|-----------------------|---------------------------------------------------| +| digestFunction | _SHA256_, SHA1 | | Digest function for this implementation | +| defaultActionTimeout | Integer, _600_ | | Default timeout value for an action (seconds) | +| maximumActionTimeout | Integer, _3600_ | | Maximum allowed action timeout (seconds) | +| maxEntrySizeBytes | Long, _2147483648_ | | Maximum size of a single blob accepted (bytes) | +| prometheusPort | Integer, _9090_ | --prometheus_port | Listening port of the Prometheus metrics endpoint | Example: From 21448c3954ff70268c9a8e21871c0fdd7f44767a Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 12 Oct 2023 10:48:54 -0700 Subject: [PATCH 31/58] docs(configuration): readability and typos --- _site/docs/configuration/configuration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index 21956e7b3d..9a806ef336 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -21,8 +21,8 @@ worker: publicName: "localhost:8981" ``` -The configuration can be provided to the server and worker as a CLI argument or through 
the env variable `CONFIG_PATH` -For an example config containing all of the configuration values, see `examples/config.yml`. +The configuration can be provided to the server and worker as a CLI argument or through the environment variable `CONFIG_PATH` +For an example configuration containing all of the configuration values, see `examples/config.yml`. ## All Configurations @@ -64,7 +64,7 @@ worker: | sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | | runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued | | dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) | -| runOperationQueuer | boolean, _true_ | | Aquire execute request entries cooperatively from an arrival queue on the backplane | +| runOperationQueuer | boolean, _true_ | | Acquire execute request entries cooperatively from an arrival queue on the backplane | | ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. If any outputs are missing a cache miss is returned | | maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) | | maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation | From 461cc5a637d9b5617534dbc39b02e78b9a042312 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 12 Oct 2023 10:49:21 -0700 Subject: [PATCH 32/58] style(configuration.md): table formatting --- _site/docs/configuration/configuration.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index 9a806ef336..2c08e8598e 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -105,12 +105,12 @@ server: ### Server Caches -| Configuration | Accepted and _Default_ Values | Description | -|--------------------------|-------------------------------|--------------------------------------------------------| -| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. | -| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. | -| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. | -| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. | +| Configuration | Accepted and _Default_ Values | Description | +|---------------------------------------|-------------------------------|----------------------------------------------------------------------| +| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. | +| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. | +| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. | +| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. 
| Example: From b74531c7c6dd74a83ccb1d07c4b8ce9ba8d66d4d Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Fri, 6 Oct 2023 16:01:26 -0700 Subject: [PATCH 33/58] feat: support --redis_uri command line option Support a `--redis_uri` command line option for start-up. --- .../build/buildfarm/common/config/BuildfarmConfigs.java | 6 ++++++ .../build/buildfarm/common/config/BuildfarmOptions.java | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 1a15e40803..444d25a13d 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -80,6 +80,9 @@ public static BuildfarmConfigs loadServerConfigs(String[] args) throws Configura if (options.prometheusPort >= 0) { buildfarmConfigs.setPrometheusPort(options.prometheusPort); } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustServerConfigs(buildfarmConfigs); return buildfarmConfigs; } @@ -99,6 +102,9 @@ public static BuildfarmConfigs loadWorkerConfigs(String[] args) throws Configura if (options.prometheusPort >= 0) { buildfarmConfigs.setPrometheusPort(options.prometheusPort); } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustWorkerConfigs(buildfarmConfigs); return buildfarmConfigs; } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java index defeebbf48..1dbe0c7f98 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java @@ -27,4 +27,10 @@ public class BuildfarmOptions extends OptionsBase { help = "Port for the prometheus service. '0' will disable prometheus hosting", defaultValue = "-1") public int prometheusPort; + + @Option( + name = "redis_uri", + help = "URI for Redis connection. Use 'redis://' or 'rediss://' for the scheme", + defaultValue = "") + public String redisUri; } From d860b277f891222dd83a0cc746a193863e60d715 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 12 Oct 2023 10:46:17 -0700 Subject: [PATCH 34/58] docs(configuration): document the --redis_uri command line options also fixed some spelling typos. --- _site/docs/configuration/configuration.md | 70 +++++++++++------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index 2c08e8598e..ae913adf88 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -169,41 +169,41 @@ server: ### Redis Backplane -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|------------------------------|------------------------------------------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| type | _SHARD_ | | Type of backplane. Currently, the only implemntation is SHARD utilizing Redis | -| redisUri | String, redis://localhost:6379 | REDIS_URI | Redis cluster endpoint. 
This must be a single URI | -| redisPassword | String, _null_ | | Redis password, if applicable | -| redisNodes | List of Strings, _null_ | | List of individual Redis nodes, if applicable | -| jedisPoolMaxTotal | Integer, _4000_ | | The size of the Redis connection pool | -| workersHashName | String, _Workers_ | | Redis key used to store a hash of registered workers | -| workerChannel | String, _WorkerChannel_ | | Redis pubsub channel key where changes of the cluster membership are announced | -| actionCachePrefix | String, _ActionCache_ | | Redis key prefix for all ActionCache entries | -| actionCacheExpire | Integer, _2419200_ | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit | -| actionBlacklistPrefix | String, _ActionBlacklist_ | | Redis key prefix for all blacklisted actions, which are rejected | -| actionBlacklistExpire | Integer, _3600_ | | The TTL maintained for action blacklist entries | -| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | Redis key prefix for blacklisted invocations, suffixed with a a tool invocation ID | -| operationPrefix | String, _Operation_ | | Redis key prefix for all operations, suffixed wtih the operation's name | -| operationExpire | Integer, _604800_ | | The TTL maintained for all operations, updated on each modification | -| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry | -| processingListName | String, _{Arrival}:ProcessingOperations_ | | Redis key of a list used to ensure reliable processing of arrival queue etries with operation watch monitoring | -| processingPrefix | String, _Processing_ | | Redis key prefix for operations which are being dequeued from the arrival queue | -| processingTimeoutMillis | Integer, _20000_ | | Delay (in ms) used to populate processing operation entries | -| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | Redis key used to store a list of QueueEntry awaiting execution by workers | -| dispatchingPrefix | String, _Dispatching_ | | Redis key prefix for operations which are being dequeued from the ready to run queue | -| dispatchingTimeoutMillis | Integer, _10000_ | | Delay (in ms) used to populate dispatching operation entries | -| dispatchedOperationsHashName | String, _DispatchedOperations_ | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor | -| operationChannelPrefix | String, _OperationChannel_ | | Redis pubsub channel prefix suffixed by an operation name | -| casPrefix | String, _ContentAddressableStorage_ | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability | -| casExpire | Integer, _604800_ | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob | -| subscribeToBackplane | boolean, _true_ | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. 
If disabled, responsiveness of watchers and CAS are reduced | -| runFailsafeOperation | boolean, _true_ | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state | -| maxQueueDepth | Integer, _100000_ | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution | -| maxPreQueueDepth | Integer, _1000000_ | | Maximum lengh that the arrival queue is allowed to reach to control load on the Redis cluster | -| priorityQueue | boolean, _false_ | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag | -| timeout | Integer, _10000_ | | Default timeout | -| maxAttempts | Integer, _20_ | | Maximum number of execution attempts | -| cacheCas | boolean, _false_ | | | +| Configuration | Accepted and _Default_ Values | Environment Var | Command Line Argument | Description | +|------------------------------|------------------------------------------|-----------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | _SHARD_ | | | Type of backplane. Currently, the only implementation is SHARD utilizing Redis | +| redisUri | String, redis://localhost:6379 | REDIS_URI | --redis_uri | Redis cluster endpoint. This must be a single URI | +| redisPassword | String, _null_ | | | Redis password, if applicable | +| redisNodes | List of Strings, _null_ | | | List of individual Redis nodes, if applicable | +| jedisPoolMaxTotal | Integer, _4000_ | | | The size of the Redis connection pool | +| workersHashName | String, _Workers_ | | | Redis key used to store a hash of registered workers | +| workerChannel | String, _WorkerChannel_ | | | Redis pubsub channel key where changes of the cluster membership are announced | +| actionCachePrefix | String, _ActionCache_ | | | Redis key prefix for all ActionCache entries | +| actionCacheExpire | Integer, _2419200_ | | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit | +| actionBlacklistPrefix | String, _ActionBlacklist_ | | | Redis key prefix for all blacklisted actions, which are rejected | +| actionBlacklistExpire | Integer, _3600_ | | | The TTL maintained for action blacklist entries | +| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | | Redis key prefix for blacklisted invocations, suffixed with a a tool invocation ID | +| operationPrefix | String, _Operation_ | | | Redis key prefix for all operations, suffixed with the operation's name | +| operationExpire | Integer, _604800_ | | | The TTL maintained for all operations, updated on each modification | +| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry | +| processingListName | String, _{Arrival}:ProcessingOperations_ | | | Redis key of a list used to ensure reliable processing of arrival queue entries with operation watch monitoring | +| processingPrefix | String, _Processing_ | | | Redis key prefix for operations which are being dequeued from the arrival queue | +| processingTimeoutMillis | Integer, _20000_ | | | Delay (in ms) used to populate processing operation entries | +| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | | Redis key used to store a list of QueueEntry awaiting execution by workers | +| 
dispatchingPrefix | String, _Dispatching_ | | | Redis key prefix for operations which are being dequeued from the ready to run queue | +| dispatchingTimeoutMillis | Integer, _10000_ | | | Delay (in ms) used to populate dispatching operation entries | +| dispatchedOperationsHashName | String, _DispatchedOperations_ | | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor | +| operationChannelPrefix | String, _OperationChannel_ | | | Redis pubsub channel prefix suffixed by an operation name | +| casPrefix | String, _ContentAddressableStorage_ | | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability | +| casExpire | Integer, _604800_ | | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob | +| subscribeToBackplane | boolean, _true_ | | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. If disabled, responsiveness of watchers and CAS are reduced | +| runFailsafeOperation | boolean, _true_ | | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state | +| maxQueueDepth | Integer, _100000_ | | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution | +| maxPreQueueDepth | Integer, _1000000_ | | | Maximum lengh that the arrival queue is allowed to reach to control load on the Redis cluster | +| priorityQueue | boolean, _false_ | | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag | +| timeout | Integer, _10000_ | | | Default timeout | +| maxAttempts | Integer, _20_ | | | Maximum number of execution attempts | +| cacheCas | boolean, _false_ | | | | Example: From 10bf1398bfecc13804b865ffde0b0e9613c80c23 Mon Sep 17 00:00:00 2001 From: Jorge Pinto Sousa Date: Wed, 11 Oct 2023 16:28:41 +0200 Subject: [PATCH 35/58] Example should use `container_image` instead of `java_image` --- _site/docs/execution/environment.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_site/docs/execution/environment.md b/_site/docs/execution/environment.md index 4a17aabe6b..d2cbca0b51 100644 --- a/_site/docs/execution/environment.md +++ b/_site/docs/execution/environment.md @@ -116,7 +116,7 @@ Next we will create a BUILD file to create our target image. 
We will use the sha ``` load("@io_bazel_rules_docker//container:container.bzl", "container_image") -java_image( +container_image( name = "buildfarm-shard-worker-ubuntu20-java14", base = "@ubuntu20_java14_image_base//image", files = [ From 9d4c86c8021804aeb5f8723b4faf914963a95386 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 21 Sep 2023 22:16:27 -0700 Subject: [PATCH 36/58] chore: bump rules_jvm_external Bumping 4.2 -> 5.3 --- deps.bzl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deps.bzl b/deps.bzl index c31cf04edd..833fe1447e 100644 --- a/deps.bzl +++ b/deps.bzl @@ -5,8 +5,8 @@ buildfarm dependencies that can be imported into other WORKSPACE files load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file", "http_jar") load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") -RULES_JVM_EXTERNAL_TAG = "4.2" -RULES_JVM_EXTERNAL_SHA = "cd1a77b7b02e8e008439ca76fd34f5b07aecb8c752961f9640dea15e9e5ba1ca" +RULES_JVM_EXTERNAL_TAG = "5.3" +RULES_JVM_EXTERNAL_SHA = "d31e369b854322ca5098ea12c69d7175ded971435e55c18dd9dd5f29cc5249ac" def archive_dependencies(third_party): return [ @@ -22,7 +22,7 @@ def archive_dependencies(third_party): "name": "rules_jvm_external", "strip_prefix": "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG, "sha256": RULES_JVM_EXTERNAL_SHA, - "url": "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG, + "url": "https://github.com/bazelbuild/rules_jvm_external/releases/download/%s/rules_jvm_external-%s.tar.gz" % (RULES_JVM_EXTERNAL_TAG, RULES_JVM_EXTERNAL_TAG), }, # Kubernetes rules. Useful for local development with tilt. From 5bc4564590eba4585676ef411f093423ee6b1b19 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Thu, 21 Sep 2023 22:26:06 -0700 Subject: [PATCH 37/58] chore: bump rules_cc Bump fro 0.0.6 -> 0.0.9 --- deps.bzl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deps.bzl b/deps.bzl index 833fe1447e..f802fb27e7 100644 --- a/deps.bzl +++ b/deps.bzl @@ -92,9 +92,9 @@ def archive_dependencies(third_party): }, { "name": "rules_cc", - "sha256": "3d9e271e2876ba42e114c9b9bc51454e379cbf0ec9ef9d40e2ae4cec61a31b40", - "strip_prefix": "rules_cc-0.0.6", - "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.6/rules_cc-0.0.6.tar.gz", + "sha256": "2037875b9a4456dce4a79d112a8ae885bbc4aad968e6587dca6e64f3a0900cdf", + "strip_prefix": "rules_cc-0.0.9", + "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.9/rules_cc-0.0.9.tar.gz", }, # Used to format proto files From edf2211c086b839aa1bb1957245febc681685e2e Mon Sep 17 00:00:00 2001 From: Trevor Hickey Date: Mon, 16 Oct 2023 23:54:31 -0400 Subject: [PATCH 38/58] Implement local resources for workers (#1282) --- _site/docs/contribute/design-documents.md | 3 +- .../buildfarm/common/config/Backplane.java | 15 +- .../common/config/LimitedResource.java | 43 ++++ .../build/buildfarm/common/config/Worker.java | 6 + .../common/redis/BalancedRedisQueue.java | 11 + .../instance/shard/OperationQueue.java | 84 +++++++- src/main/java/build/buildfarm/worker/BUILD | 1 + .../worker/DequeueMatchEvaluator.java | 37 ++-- .../java/build/buildfarm/worker/Executor.java | 3 + .../build/buildfarm/worker/WorkerContext.java | 2 + .../build/buildfarm/worker/resources/BUILD | 1 + .../worker/resources/LocalResourceSet.java | 37 ++++ .../resources/LocalResourceSetMetrics.java | 46 +++++ .../resources/LocalResourceSetUtils.java | 120 +++++++++++ .../worker/shard/ShardWorkerContext.java | 22 +- 
.../build/buildfarm/worker/shard/Worker.java | 2 + .../common/config/GrpcMetricsTest.java | 1 - .../java/build/buildfarm/instance/shard/BUILD | 192 +++++++++++++++++- .../shard/RedisShardBackplaneTest.java | 6 +- .../instance/shard/ShardInstanceTest.java | 1 - .../worker/DequeueMatchEvaluatorTest.java | 168 ++++++++++++++- .../buildfarm/worker/StubWorkerContext.java | 5 + .../java/build/buildfarm/worker/shard/BUILD | 1 + .../worker/shard/ShardWorkerContextTest.java | 2 + 24 files changed, 759 insertions(+), 50 deletions(-) create mode 100644 src/main/java/build/buildfarm/common/config/LimitedResource.java create mode 100644 src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java create mode 100644 src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java create mode 100644 src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java diff --git a/_site/docs/contribute/design-documents.md b/_site/docs/contribute/design-documents.md index f7b1c0b80d..ea608696f3 100644 --- a/_site/docs/contribute/design-documents.md +++ b/_site/docs/contribute/design-documents.md @@ -5,4 +5,5 @@ parent: Contribute nav_order: 2 --- -[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing) \ No newline at end of file +[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing) +[Local and Global Resources](https://docs.google.com/document/d/1u0TkmVmdMS53PWR1hgh-a_cj3NmQYE0Favv9aGFfQZs/edit?usp=sharing) \ No newline at end of file diff --git a/src/main/java/build/buildfarm/common/config/Backplane.java b/src/main/java/build/buildfarm/common/config/Backplane.java index 5c21532ab4..cc15a6028a 100644 --- a/src/main/java/build/buildfarm/common/config/Backplane.java +++ b/src/main/java/build/buildfarm/common/config/Backplane.java @@ -1,6 +1,7 @@ package build.buildfarm.common.config; -import com.google.common.base.Strings; +import java.util.ArrayList; +import java.util.List; import lombok.AccessLevel; import lombok.Data; import lombok.Getter; @@ -52,13 +53,7 @@ public enum BACKPLANE_TYPE { private boolean cacheCas = false; private long priorityPollIntervalMillis = 100; - public String getRedisUri() { - // use environment override (useful for containerized deployment) - if (!Strings.isNullOrEmpty(System.getenv("REDIS_URI"))) { - return System.getenv("REDIS_URI"); - } - - // use configured value - return redisUri; - } + // These limited resources are shared across all workers. + // An example would be a limited number of seats to a license server. + private List resources = new ArrayList<>(); } diff --git a/src/main/java/build/buildfarm/common/config/LimitedResource.java b/src/main/java/build/buildfarm/common/config/LimitedResource.java new file mode 100644 index 0000000000..f3b09ff621 --- /dev/null +++ b/src/main/java/build/buildfarm/common/config/LimitedResource.java @@ -0,0 +1,43 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.common.config;
+
+import lombok.Data;
+
+/**
+ * @class Limited Resource
+ * @brief A fixed amount of a specific resource.
+ * @details We define a limited resource as a counting semaphore whose configuration contains a name
+ *     and a count representing a physical or logical group of units obtained by executors as a
+ *     precondition to fulfill a long running operation. These units are released upon the
+ *     operation's completion. The resource is requested by the action's platform properties.
+ */
+@Data
+public class LimitedResource {
+  /**
+   * @field name
+   * @brief The name of the resource.
+   * @details This should correspond to the platform property's key name:
+   *     resources:<name>:<amount>
+   */
+  public String name;
+
+  /**
+   * @field amount
+   * @brief The total amount of the resource that's available for use during execution.
+   * @details As a counting semaphore, this amount becomes the limit.
+   */
+  public int amount = 1;
+}
diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java
index 1661df9bfe..e987c1e370 100644
--- a/src/main/java/build/buildfarm/common/config/Worker.java
+++ b/src/main/java/build/buildfarm/common/config/Worker.java
@@ -5,6 +5,7 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import javax.naming.ConfigurationException;
@@ -39,6 +40,11 @@ public class Worker {
   private ExecutionPolicy[] executionPolicies = {};
   private SandboxSettings sandboxSettings = new SandboxSettings();

+  // These limited resources are only for the individual worker.
+  // An example would be hardware resources such as GPUs.
+  // If you want GPU actions to run exclusively, define a single GPU resource.
+  private List<LimitedResource> resources = new ArrayList<>();
+
   public ExecutionPolicy[] getExecutionPolicies() {
     if (executionPolicies != null) {
       return executionPolicies;
diff --git a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java
index 52f937c8ed..9123f5eaf9 100644
--- a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java
+++ b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java
@@ -236,6 +236,17 @@ public String dequeue(JedisCluster jedis) throws InterruptedException {
     }
   }

+  /**
+   * @brief Pop an element from the queue and return its value, without blocking.
+   * @details Null is returned if the queue is empty.
+   * @return The value of the transferred element. null if the queue is empty or the thread was interrupted.
+   * @note Suggested return identifier: val.
+   */
+  public String nonBlockingDequeue(JedisCluster jedis) throws InterruptedException {
+    QueueInterface queue = queues.get(roundRobinPopIndex());
+    return queue.nonBlockingDequeue(jedis);
+  }
+
   /**
    * @brief Get the current pop queue.
    * @details Get the queue that the balanced queue intends to pop from next.
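Before the queue and matcher changes below, a minimal sketch of the local-resource mechanism these patches build toward: each configured `LimitedResource` acts as a counting semaphore that must be claimed before a dequeued operation is kept, and is released when execution finishes. This is illustrative only, not the exact `LocalResourceSetUtils` API:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

// Illustrative only — the real implementation lives in LocalResourceSet/Utils.
class LocalResourcesSketch {
  private final Map<String, Semaphore> resources = new ConcurrentHashMap<>();

  LocalResourcesSketch(Map<String, Integer> configured) {
    // one counting semaphore per configured LimitedResource, e.g. "gpu" -> 1
    configured.forEach((name, amount) -> resources.put(name, new Semaphore(amount)));
  }

  // claim before keeping a dequeued operation; non-blocking, so the worker can
  // requeue the entry instead of stalling when the resource is exhausted
  boolean claim(String name, int amount) {
    Semaphore semaphore = resources.get(name);
    return semaphore != null && semaphore.tryAcquire(amount);
  }

  // release when execution completes, successfully or not
  void release(String name, int amount) {
    resources.get(name).release(amount);
  }
}
```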
diff --git a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java index 64f1f2befa..78dd77c769 100644 --- a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java +++ b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java @@ -24,6 +24,7 @@ import com.google.common.collect.SetMultimap; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import redis.clients.jedis.JedisCluster; /** @@ -48,6 +49,14 @@ public class OperationQueue { */ private final List queues; + /** + * @field currentDequeueIndex + * @brief The current queue index to dequeue from. + * @details Used in a round-robin fashion to ensure an even distribution of dequeues across + * matched queues. + */ + private int currentDequeueIndex = 0; + /** * @brief Constructor. * @details Construct the operation queue with various provisioned redis queues. @@ -186,8 +195,18 @@ public void push( */ public String dequeue(JedisCluster jedis, List provisions) throws InterruptedException { - BalancedRedisQueue queue = chooseEligibleQueue(provisions); - return queue.dequeue(jedis); + // Select all matched queues, and attempt dequeuing via round-robin. + List queues = chooseEligibleQueues(provisions); + int index = roundRobinPopIndex(queues); + String value = queues.get(index).nonBlockingDequeue(jedis); + + // Keep iterating over matched queues until we find one that is non-empty and provides a + // dequeued value. + while (value == null) { + index = roundRobinPopIndex(queues); + value = queues.get(index).nonBlockingDequeue(jedis); + } + return value; } /** @@ -270,6 +289,39 @@ private BalancedRedisQueue chooseEligibleQueue(List provision } } + throwNoEligibleQueueException(provisions); + return null; + } + + /** + * @brief Choose an eligible queues based on given properties. + * @details We use the platform execution properties of a queue entry to determine the appropriate + * queues. If there no eligible queues, an exception is thrown. + * @param provisions Provisions to check that requirements are met. + * @return The chosen queues. + * @note Suggested return identifier: queues. + */ + private List chooseEligibleQueues(List provisions) { + List eligibleQueues = + queues.stream() + .filter(provisionedQueue -> provisionedQueue.isEligible(toMultimap(provisions))) + .map(provisionedQueue -> provisionedQueue.queue()) + .collect(Collectors.toList()); + + if (eligibleQueues.isEmpty()) { + throwNoEligibleQueueException(provisions); + } + + return eligibleQueues; + } + + /** + * @brief Throw an exception that explains why there are no eligible queues. + * @details This function should only be called, when there were no matched queues. + * @param provisions Provisions to check that requirements are met. + * @return no return. + */ + private void throwNoEligibleQueueException(List provisions) { // At this point, we were unable to match an action to an eligible queue. // We will build an error explaining why the matching failed. This will help user's properly // configure their queue or adjust the execution_properties of their actions. @@ -286,6 +338,34 @@ private BalancedRedisQueue chooseEligibleQueue(List provision + eligibilityResults); } + /** + * @brief Get the current queue index for round-robin dequeues. + * @details Adjusts the round-robin index for next call. + * @param matchedQueues The queues to round robin. + * @return The current round-robin index. + * @note Suggested return identifier: queueIndex. 
+ */
+ private int roundRobinPopIndex(List<BalancedRedisQueue> matchedQueues) {
+ int currentIndex = currentDequeueIndex;
+ currentDequeueIndex = nextQueueInRoundRobin(currentDequeueIndex, matchedQueues);
+ return currentIndex;
+ }
+
+ /**
+ * @brief Get the next queue in the round robin.
+ * @details If we are currently on the last queue, it becomes the first queue.
+ * @param index Current queue index.
+ * @param matchedQueues The queues to round robin.
+ * @return An adjusted value based on the current queue index.
+ * @note Suggested return identifier: adjustedCurrentQueue.
+ */
+ private int nextQueueInRoundRobin(int index, List<BalancedRedisQueue> matchedQueues) {
+ if (index >= matchedQueues.size() - 1) {
+ return 0;
+ }
+ return index + 1;
+ }
+
 /**
 * @brief Convert proto provisions into java multimap.
 * @details This conversion is done to more easily check if a key/value exists in the provisions.
diff --git a/src/main/java/build/buildfarm/worker/BUILD b/src/main/java/build/buildfarm/worker/BUILD
index c25b9e74d6..e24b50d731 100644
--- a/src/main/java/build/buildfarm/worker/BUILD
+++ b/src/main/java/build/buildfarm/worker/BUILD
@@ -36,6 +36,7 @@ java_library(
 "@maven//:io_grpc_grpc_stub",
 "@maven//:io_prometheus_simpleclient",
 "@maven//:org_apache_commons_commons_compress",
+ "@maven//:org_apache_commons_commons_lang3",
 "@maven//:org_jetbrains_annotations",
 "@maven//:org_projectlombok_lombok",
 ],
diff --git a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java
index c3c4c1dfc9..3536ed6bac 100644
--- a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java
+++ b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java
@@ -14,11 +14,12 @@
 package build.buildfarm.worker;
 
-import build.bazel.remote.execution.v2.Command;
 import build.bazel.remote.execution.v2.Platform;
 import build.buildfarm.common.ExecutionProperties;
 import build.buildfarm.common.config.BuildfarmConfigs;
 import build.buildfarm.v1test.QueueEntry;
+import build.buildfarm.worker.resources.LocalResourceSet;
+import build.buildfarm.worker.resources.LocalResourceSetUtils;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.SetMultimap;
 import org.jetbrains.annotations.NotNull;
@@ -45,6 +46,7 @@ public class DequeueMatchEvaluator {
 * @brief Decide whether the worker should keep the operation or put it back on the queue.
 * @details Compares the platform properties of the worker to the operation's platform properties.
 * @param workerProvisions The provisions of the worker.
+ * @param resourceSet The limited resources that the worker has available.
 * @param queueEntry An entry recently removed from the queue.
 * @return Whether or not the worker should accept or reject the queue entry.
 * @note Overloaded.
@@ -53,24 +55,10 @@ public class DequeueMatchEvaluator {
 @SuppressWarnings("NullableProblems")
 @NotNull
 public static boolean shouldKeepOperation(
- SetMultimap<String, String> workerProvisions, QueueEntry queueEntry) {
- return shouldKeepViaPlatform(workerProvisions, queueEntry.getPlatform());
- }
-
- /**
- * @brief Decide whether the worker should keep the operation or put it back on the queue.
- * @details Compares the platform properties of the worker to the operation's platform properties.
- * @param workerProvisions The provisions of the worker.
- * @param command A command to evaluate.
- * @return Whether or not the worker should accept or reject the queue entry.
- * @note Overloaded.
- * @note Suggested return identifier: shouldKeepOperation.
- */
- @SuppressWarnings("NullableProblems")
- @NotNull
- public static boolean shouldKeepOperation(
- SetMultimap<String, String> workerProvisions, Command command) {
- return shouldKeepViaPlatform(workerProvisions, command.getPlatform());
+ SetMultimap<String, String> workerProvisions,
+ LocalResourceSet resourceSet,
+ QueueEntry queueEntry) {
+ return shouldKeepViaPlatform(workerProvisions, resourceSet, queueEntry.getPlatform());
 }
 
 /**
@@ -79,6 +67,7 @@ public static boolean shouldKeepOperation(
 * @details Compares the platform properties of the worker to the platform properties of the
 * operation.
 * @param workerProvisions The provisions of the worker.
+ * @param resourceSet The limited resources that the worker has available.
 * @param platform The platform of the operation.
 * @return Whether or not the worker should accept or reject the operation.
 * @note Suggested return identifier: shouldKeepOperation.
@@ -86,9 +75,15 @@ public static boolean shouldKeepOperation(
 @SuppressWarnings("NullableProblems")
 @NotNull
 private static boolean shouldKeepViaPlatform(
- SetMultimap<String, String> workerProvisions, Platform platform) {
- // attempt to execute everything the worker gets off the queue.
+ SetMultimap<String, String> workerProvisions,
+ LocalResourceSet resourceSet,
+ Platform platform) {
+ // attempt to execute everything the worker gets off the queue,
+ // provided there are enough resources to do so.
 // this is a recommended configuration.
+ if (!LocalResourceSetUtils.claimResources(platform, resourceSet)) {
+ return false;
+ }
 if (configs.getWorker().getDequeueMatchSettings().isAcceptEverything()) {
 return true;
 }
diff --git a/src/main/java/build/buildfarm/worker/Executor.java b/src/main/java/build/buildfarm/worker/Executor.java
index 0416c2354d..f4776a6f00 100644
--- a/src/main/java/build/buildfarm/worker/Executor.java
+++ b/src/main/java/build/buildfarm/worker/Executor.java
@@ -359,6 +359,9 @@ public void run(ResourceLimits limits) {
 } finally {
 boolean wasInterrupted = Thread.interrupted();
 try {
+ // Now that the execution has finished we can return any of the claims against local
+ // resources.
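+ // Claims were taken by DequeueMatchEvaluator when this entry was matched; releasing
+ // them in this finally block keeps each semaphore balanced even when the execution
+ // failed or was interrupted.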
+ workerContext.returnLocalResources(operationContext.queueEntry);
 owner.releaseExecutor(
 operationName,
 limits.cpu.claimed,
diff --git a/src/main/java/build/buildfarm/worker/WorkerContext.java b/src/main/java/build/buildfarm/worker/WorkerContext.java
index 873ad1b938..70060acea1 100644
--- a/src/main/java/build/buildfarm/worker/WorkerContext.java
+++ b/src/main/java/build/buildfarm/worker/WorkerContext.java
@@ -130,4 +130,6 @@ IOResource limitExecution(
 int commandExecutionClaims(Command command);
 
 ResourceLimits commandExecutionSettings(Command command);
+
+ void returnLocalResources(QueueEntry queueEntry);
 }
diff --git a/src/main/java/build/buildfarm/worker/resources/BUILD b/src/main/java/build/buildfarm/worker/resources/BUILD
index ac2d69179b..36a4dc3d34 100644
--- a/src/main/java/build/buildfarm/worker/resources/BUILD
+++ b/src/main/java/build/buildfarm/worker/resources/BUILD
@@ -8,6 +8,7 @@ java_library(
 "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto",
 "@maven//:com_google_guava_guava",
 "@maven//:com_googlecode_json_simple_json_simple",
+ "@maven//:io_prometheus_simpleclient",
 "@maven//:org_apache_commons_commons_lang3",
 ],
 )
diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java
new file mode 100644
index 0000000000..97d64fe9b1
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java
@@ -0,0 +1,37 @@
+// Copyright 2023 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.Semaphore;
+
+/**
+ * @class Local Resource Set
+ * @brief A fixed amount of a specific resource.
+ * @details We define limited resources as counting semaphores whose configuration contains a name
+ * and a count representing a physical or logical group of units obtained by executors as a
+ * precondition to fulfill a long running operation. These units are released upon the
+ * operation's completion. The resource is requested by the action's platform properties. These
+ * resources are specific to the individual worker.
+ */
+public class LocalResourceSet {
+ /**
+ * @field resources
+ * @brief A map of resource semaphores organized by name.
+ * @details Key is the resource name; value is the semaphore guarding its remaining amount.
+ */
+ public Map<String, Semaphore> resources = new HashMap<>();
+}
diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java
new file mode 100644
index 0000000000..787a7840e5
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java
@@ -0,0 +1,46 @@
+// Copyright 2020 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import io.prometheus.client.Gauge;
+
+/**
+ * @class LocalResourceSetMetrics
+ * @brief Tracks metrics related to a worker's limited local resources.
+ * @details Answers how many resources exist, how many are claimed, and by how many requesters.
+ */
+public class LocalResourceSetMetrics {
+ public static final Gauge resourceUsageMetric =
+ Gauge.build()
+ .name("local_resource_usage")
+ .labelNames("resource_name")
+ .help("The number of claims for each resource currently being used for execution")
+ .register();
+
+ public static final Gauge resourceTotalMetric =
+ Gauge.build()
+ .name("local_resource_total")
+ .labelNames("resource_name")
+ .help("The total number of claims that exist for a particular resource")
+ .register();
+
+ public static final Gauge requestersMetric =
+ Gauge.build()
+ .name("local_resource_requesters")
+ .labelNames("resource_name")
+ .help(
+ "Tracks how many actions have requested local resources. This can help determine if resources are being hogged by some actions.")
+ .register();
+}
diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java
new file mode 100644
index 0000000000..e8a1e56131
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java
@@ -0,0 +1,120 @@
+// Copyright 2020 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import build.bazel.remote.execution.v2.Platform;
+import build.buildfarm.common.config.LimitedResource;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Semaphore;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * @class LocalResourceSetUtils
+ * @brief Utilities for working with the worker's set of local limited resources.
+ * @details The methods help with allocation / de-allocation of claims, as well as metrics printing.
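+ * @note Claims are all-or-nothing: claimResources either acquires every resource an action
+ * requests or releases the partial claims and reports failure.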
+ */
+public class LocalResourceSetUtils {
+ private static final LocalResourceSetMetrics metrics = new LocalResourceSetMetrics();
+
+ public static LocalResourceSet create(List<LimitedResource> resources) {
+ LocalResourceSet resourceSet = new LocalResourceSet();
+ for (LimitedResource resource : resources) {
+ resourceSet.resources.put(resource.getName(), new Semaphore(resource.getAmount()));
+ metrics.resourceTotalMetric.labels(resource.getName()).set(resource.getAmount());
+ }
+ return resourceSet;
+ }
+
+ public static boolean claimResources(Platform platform, LocalResourceSet resourceSet) {
+ List<Map.Entry<String, Integer>> claimed = new ArrayList<>();
+
+ boolean allClaimed = true;
+ for (Platform.Property property : platform.getPropertiesList()) {
+ // Skip properties that are not requesting a limited resource.
+ String resourceName = getResourceName(property);
+ Semaphore resource = resourceSet.resources.get(resourceName);
+ if (resource == null) {
+ continue;
+ }
+
+ // Attempt to claim. If claiming fails, we must return all other claims.
+ int requestAmount = getResourceRequestAmount(property);
+ boolean wasAcquired = semaphoreAcquire(resource, resourceName, requestAmount);
+ if (wasAcquired) {
+ claimed.add(new AbstractMap.SimpleEntry<>(resourceName, requestAmount));
+ } else {
+ allClaimed = false;
+ break;
+ }
+ }
+
+ // Clean up remaining resources if they were not all claimed.
+ if (!allClaimed) {
+ for (Map.Entry<String, Integer> claim : claimed) {
+ semaphoreRelease(
+ resourceSet.resources.get(claim.getKey()), claim.getKey(), claim.getValue());
+ }
+ }
+
+ return allClaimed;
+ }
+
+ public static void releaseClaims(Platform platform, LocalResourceSet resourceSet) {
+ for (Platform.Property property : platform.getPropertiesList()) {
+ String resourceName = getResourceName(property);
+ Semaphore resource = resourceSet.resources.get(resourceName);
+ if (resource == null) {
+ continue;
+ }
+ int requestAmount = getResourceRequestAmount(property);
+ semaphoreRelease(resource, resourceName, requestAmount);
+ }
+ }
+
+ private static boolean semaphoreAcquire(Semaphore resource, String resourceName, int amount) {
+ boolean wasAcquired = resource.tryAcquire(amount);
+ if (wasAcquired) {
+ metrics.resourceUsageMetric.labels(resourceName).inc(amount);
+ }
+ metrics.requestersMetric.labels(resourceName).inc();
+ return wasAcquired;
+ }
+
+ private static void semaphoreRelease(Semaphore resource, String resourceName, int amount) {
+ resource.release(amount);
+ metrics.resourceUsageMetric.labels(resourceName).dec(amount);
+ metrics.requestersMetric.labels(resourceName).dec();
+ }
+
+ private static int getResourceRequestAmount(Platform.Property property) {
+ // We support resource values that are not numbers and interpret them as a request for 1
+ // resource. For example "gpu:RTX-4090" is equivalent to "resource:gpu:1".
+ try {
+ return Integer.parseInt(property.getValue());
+ } catch (NumberFormatException e) {
+ return 1;
+ }
+ }
+
+ private static String getResourceName(Platform.Property property) {
+ // We match keys whether they are prefixed with "resource:" or not.
+ // "resource:gpu:1" requests the gpu resource in the same way that "gpu:1" does.
+ // The prefix originates from bazel's syntax for the --extra_resources flag.
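+ // For example, properties named "resource:gpu" and "gpu" both resolve to the semaphore
+ // registered under "gpu".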
+ return StringUtils.removeStart(property.getName(), "resource:"); + } +} diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index d32731879c..c844f53963 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -60,6 +60,8 @@ import build.buildfarm.worker.cgroup.Cpu; import build.buildfarm.worker.cgroup.Group; import build.buildfarm.worker.cgroup.Mem; +import build.buildfarm.worker.resources.LocalResourceSet; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import build.buildfarm.worker.resources.ResourceDecider; import build.buildfarm.worker.resources.ResourceLimits; import com.google.common.annotations.VisibleForTesting; @@ -129,6 +131,7 @@ class ShardWorkerContext implements WorkerContext { private final Group operationsGroup = executionsGroup.getChild("operations"); private final CasWriter writer; private final boolean errorOperationRemainingResources; + private final LocalResourceSet resourceSet; static SetMultimap getMatchProvisions( Iterable policies, int executeStageWidth) { @@ -162,6 +165,7 @@ static SetMultimap getMatchProvisions( boolean onlyMulticoreTests, boolean allowBringYourOwnContainer, boolean errorOperationRemainingResources, + LocalResourceSet resourceSet, CasWriter writer) { this.name = name; this.matchProvisions = getMatchProvisions(policies, executeStageWidth); @@ -182,6 +186,7 @@ static SetMultimap getMatchProvisions( this.onlyMulticoreTests = onlyMulticoreTests; this.allowBringYourOwnContainer = allowBringYourOwnContainer; this.errorOperationRemainingResources = errorOperationRemainingResources; + this.resourceSet = resourceSet; this.writer = writer; } @@ -273,6 +278,12 @@ public QueuedOperation getQueuedOperation(QueueEntry queueEntry) @SuppressWarnings("ConstantConditions") private void matchInterruptible(MatchListener listener) throws IOException, InterruptedException { + QueueEntry queueEntry = takeEntryOffOperationQueue(listener); + decideWhetherToKeepOperation(queueEntry, listener); + } + + private QueueEntry takeEntryOffOperationQueue(MatchListener listener) + throws IOException, InterruptedException { listener.onWaitStart(); QueueEntry queueEntry = null; try { @@ -294,9 +305,13 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte // transient backplane errors will propagate a null queueEntry } listener.onWaitEnd(); + return queueEntry; + } + private void decideWhetherToKeepOperation(QueueEntry queueEntry, MatchListener listener) + throws IOException, InterruptedException { if (queueEntry == null - || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, queueEntry)) { + || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, resourceSet, queueEntry)) { listener.onEntry(queueEntry); } else { backplane.rejectOperation(queueEntry); @@ -306,6 +321,11 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte } } + @Override + public void returnLocalResources(QueueEntry queueEntry) { + LocalResourceSetUtils.releaseClaims(queueEntry.getPlatform(), resourceSet); + } + @Override public void match(MatchListener listener) throws InterruptedException { RetryingMatchListener dedupMatchListener = diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 153df1a371..7a0d4a60ed 100644 --- 
a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -59,6 +59,7 @@ import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.ReportResultStage; import build.buildfarm.worker.SuperscalarPipelineStage; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; import com.google.longrunning.Operation; @@ -562,6 +563,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep configs.getWorker().isOnlyMulticoreTests(), configs.getWorker().isAllowBringYourOwnContainer(), configs.getWorker().isErrorOperationRemainingResources(), + LocalResourceSetUtils.create(configs.getWorker().getResources()), writer); pipeline = new Pipeline(); diff --git a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java index 5301b82608..547f81b2ca 100644 --- a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java +++ b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java @@ -30,7 +30,6 @@ public void testHandleGrpcMetricIntercepts_withLatencyBucket() { grpcMetrics.setEnabled(true); grpcMetrics.setProvideLatencyHistograms(true); grpcMetrics.setLatencyBuckets(new double[] {1, 2, 3}); - GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, grpcMetrics); verify(serverBuilder, times(1)).intercept(any(MonitoringServerInterceptor.class)); } diff --git a/src/test/java/build/buildfarm/instance/shard/BUILD b/src/test/java/build/buildfarm/instance/shard/BUILD index 6a501a1b68..cd0ae18d24 100644 --- a/src/test/java/build/buildfarm/instance/shard/BUILD +++ b/src/test/java/build/buildfarm/instance/shard/BUILD @@ -1,7 +1,195 @@ java_test( - name = "tests", + name = "DispatchedMonitorTest", size = "small", - srcs = glob(["*.java"]), + srcs = [ + "DispatchedMonitorTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardBackplaneTest", + size = "small", + srcs = [ + "RedisShardBackplaneTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + 
"//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardSubscriberTest", + size = "small", + srcs = [ + "RedisShardSubscriberTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "ShardInstanceTest", + size = "small", + srcs = [ + "ShardInstanceTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + 
"@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "TimedWatcherTest", + size = "small", + srcs = [ + "TimedWatcherTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "UtilTest", + size = "small", + srcs = [ + "UnobservableWatcher.java", + "UtilTest.java", + ], data = ["//examples:example_configs"], test_class = "build.buildfarm.AllTests", deps = [ diff --git a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java index 16e5bb8fbc..3772459cf7 100644 --- a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java +++ b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java @@ -213,8 +213,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) @@ -266,8 +265,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) diff --git a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java index 1fdb7fc460..aa1adc5d3d 100644 --- a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java +++ 
b/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java @@ -1052,7 +1052,6 @@ public void containsBlobReflectsWorkerWithUnknownSize() throws Exception { @Test public void findMissingBlobsTest_ViaBackPlane() throws Exception { - Set activeWorkers = ImmutableSet.of("worker1", "worker2", "worker3"); Set expiredWorkers = ImmutableSet.of("workerX", "workerY", "workerZ"); Set imposterWorkers = ImmutableSet.of("imposter1", "imposter2", "imposter3"); diff --git a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java index bc7acd00a4..47f25cdf79 100644 --- a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java +++ b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java @@ -19,8 +19,10 @@ import build.bazel.remote.execution.v2.Platform; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.QueueEntry; +import build.buildfarm.worker.resources.LocalResourceSet; import com.google.common.collect.HashMultimap; import com.google.common.collect.SetMultimap; +import java.util.concurrent.Semaphore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -52,10 +54,12 @@ public class DequeueMatchEvaluatorTest { public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception { // ARRANGE SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); QueueEntry entry = QueueEntry.newBuilder().setPlatform(Platform.newBuilder()).build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); @@ -70,6 +74,7 @@ public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception { public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception { // ARRANGE SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "11"); QueueEntry entry = @@ -81,7 +86,8 @@ public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception { .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker accepts because it has more cores than the min-cores requested @@ -98,6 +104,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { // ARRANGE configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false); SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "10"); QueueEntry entry = @@ -109,7 +116,8 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker rejects because it has less cores than the min-cores requested @@ -123,6 +131,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception { public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Exception { // ARRANGE SetMultimap workerProvisions = 
HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); workerProvisions.put("cores", "10"); QueueEntry entry = @@ -136,7 +145,8 @@ public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Excepti .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT // the worker accepts because it has the same cores as the min-cores requested @@ -153,6 +163,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false); configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(false); SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); QueueEntry entry = QueueEntry.newBuilder() @@ -163,7 +174,8 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E .build(); // ACT - boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isFalse(); @@ -172,7 +184,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); // ACT - shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); @@ -181,9 +193,151 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws E configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); // ACT - shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry); + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); // ASSERT assertThat(shouldKeep).isTrue(); } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resource should be claimed + // Failure explanation: semaphore claim did not work as expected. + @Test + public void shouldKeepOperationClaimsResource() throws Exception { + // ARRANGE + configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); + configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); + SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); + resourceSet.resources.put("FOO", new Semaphore(1)); + + QueueEntry entry = + QueueEntry.newBuilder() + .setPlatform( + Platform.newBuilder() + .addProperties( + Platform.Property.newBuilder().setName("resource:FOO").setValue("1"))) + .build(); + + // PRE-ASSERT + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1); + + // ACT + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker rejects because there are no resources left. 
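+ // A rejected attempt must not consume or release any permits.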
+ assertThat(shouldKeep).isFalse(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resources should be claimed + // Failure explanation: semaphore claim did not work as expected. + @Test + public void shouldKeepOperationClaimsMultipleResource() throws Exception { + // ARRANGE + configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true); + configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true); + SetMultimap workerProvisions = HashMultimap.create(); + LocalResourceSet resourceSet = new LocalResourceSet(); + resourceSet.resources.put("FOO", new Semaphore(2)); + resourceSet.resources.put("BAR", new Semaphore(4)); + + QueueEntry entry = + QueueEntry.newBuilder() + .setPlatform( + Platform.newBuilder() + .addProperties( + Platform.Property.newBuilder().setName("resource:FOO").setValue("1")) + .addProperties( + Platform.Property.newBuilder().setName("resource:BAR").setValue("2"))) + .build(); + + // PRE-ASSERT + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(2); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(4); + + // ACT + boolean shouldKeep = + DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(2); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker accepts because the resource is available. + assertThat(shouldKeep).isTrue(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0); + + // ACT + shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry); + + // ASSERT + // the worker rejects because there are no resources left. + assertThat(shouldKeep).isFalse(); + assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0); + assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0); + } + + // Function under test: shouldKeepOperation + // Reason for testing: the local resources should fail to claim, and the existing amount should be + // the same. + // Failure explanation: semaphore claim did not work as expected. 
+ @Test
+ public void shouldKeepOperationFailsToClaimSameAmountRemains() throws Exception {
+ // ARRANGE
+ configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true);
+ configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true);
+ SetMultimap<String, String> workerProvisions = HashMultimap.create();
+ LocalResourceSet resourceSet = new LocalResourceSet();
+ resourceSet.resources.put("FOO", new Semaphore(50));
+ resourceSet.resources.put("BAR", new Semaphore(100));
+ resourceSet.resources.put("BAZ", new Semaphore(200));
+
+ QueueEntry entry =
+ QueueEntry.newBuilder()
+ .setPlatform(
+ Platform.newBuilder()
+ .addProperties(
+ Platform.Property.newBuilder().setName("resource:FOO").setValue("20"))
+ .addProperties(
+ Platform.Property.newBuilder().setName("resource:BAR").setValue("101"))
+ .addProperties(
+ Platform.Property.newBuilder().setName("resource:BAZ").setValue("20")))
+ .build();
+
+ // PRE-ASSERT
+ assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50);
+ assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100);
+ assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200);
+
+ // ACT
+ boolean shouldKeep =
+ DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+ // ASSERT
+ // the worker rejects because the BAR request exceeds the available permits, so the
+ // partial FOO claim is rolled back and the same amounts remain available.
+ assertThat(shouldKeep).isFalse();
+ assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50);
+ assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100);
+ assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200);
+ }
 }
diff --git a/src/test/java/build/buildfarm/worker/StubWorkerContext.java b/src/test/java/build/buildfarm/worker/StubWorkerContext.java
index d288d67a6f..7ccf1182d3 100644
--- a/src/test/java/build/buildfarm/worker/StubWorkerContext.java
+++ b/src/test/java/build/buildfarm/worker/StubWorkerContext.java
@@ -222,4 +222,9 @@ public ResourceLimits commandExecutionSettings(Command command) {
 public boolean shouldErrorOperationOnRemainingResources() {
 throw new UnsupportedOperationException();
 }
+
+ @Override
+ public void returnLocalResources(QueueEntry queueEntry) {
+ throw new UnsupportedOperationException();
+ }
 }
diff --git a/src/test/java/build/buildfarm/worker/shard/BUILD b/src/test/java/build/buildfarm/worker/shard/BUILD
index b8a7b31ec6..c254e41fa4 100644
--- a/src/test/java/build/buildfarm/worker/shard/BUILD
+++ b/src/test/java/build/buildfarm/worker/shard/BUILD
@@ -11,6 +11,7 @@ java_test(
 "//src/main/java/build/buildfarm/common/config",
 "//src/main/java/build/buildfarm/instance",
 "//src/main/java/build/buildfarm/worker",
+ "//src/main/java/build/buildfarm/worker/resources",
 "//src/main/java/build/buildfarm/worker/shard",
 "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto",
 "//src/test/java/build/buildfarm:test_runner",
diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
index dd584c49af..c24e681abb 100644
--- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
+++ b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
@@ -35,6 +35,7 @@
 import build.buildfarm.instance.MatchListener;
 import build.buildfarm.v1test.QueueEntry;
 import build.buildfarm.worker.WorkerContext;
+import build.buildfarm.worker.resources.LocalResourceSet;
 import com.google.common.collect.ImmutableList;
import com.google.protobuf.Duration; import java.util.ArrayList; @@ -105,6 +106,7 @@ WorkerContext createTestContext(Iterable policies) { /* onlyMulticoreTests=*/ false, /* allowBringYourOwnContainer=*/ false, /* errorOperationRemainingResources=*/ false, + /* resourceSet=*/ new LocalResourceSet(), writer); } From 97e3b90c263a27b6e08713d2455a590f7f47405c Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:34:32 -0700 Subject: [PATCH 39/58] build: override grpc dependencies with our dependencies Don't get transitive grpc dependencies, use the ones from our `maven_install(...)` --- defs.bzl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/defs.bzl b/defs.bzl index ec51856484..869fdd2da7 100644 --- a/defs.bzl +++ b/defs.bzl @@ -8,7 +8,7 @@ load( "@io_bazel_rules_docker//repositories:repositories.bzl", container_repositories = "repositories", ) -load("@io_grpc_grpc_java//:repositories.bzl", "grpc_java_repositories") +load("@io_grpc_grpc_java//:repositories.bzl", "grpc_java_repositories", "IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS") load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") load("@com_grail_bazel_toolchain//toolchain:rules.bzl", "llvm_toolchain") load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories") @@ -114,6 +114,7 @@ def buildfarm_init(name = "buildfarm"): "org.projectlombok:lombok:1.18.24", ], generate_compat_repositories = True, + override_targets = IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS, repositories = [ "https://repo1.maven.org/maven2", "https://mirrors.ibiblio.org/pub/mirrors/maven2", From 96f239d5131b17c32d07a46bb583b487472e5d1c Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:35:25 -0700 Subject: [PATCH 40/58] chore(deps): bump protobuf runtime to 3.19.1 --- defs.bzl | 4 ++-- deps.bzl | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/defs.bzl b/defs.bzl index 869fdd2da7..ad8d07ced8 100644 --- a/defs.bzl +++ b/defs.bzl @@ -79,8 +79,8 @@ def buildfarm_init(name = "buildfarm"): "com.google.guava:guava:32.1.1-jre", "com.google.j2objc:j2objc-annotations:1.1", "com.google.jimfs:jimfs:1.1", - "com.google.protobuf:protobuf-java-util:3.10.0", - "com.google.protobuf:protobuf-java:3.10.0", + "com.google.protobuf:protobuf-java-util:3.19.1", + "com.google.protobuf:protobuf-java:3.19.1", "com.google.truth:truth:0.44", "org.slf4j:slf4j-simple:1.7.35", "com.googlecode.json-simple:json-simple:1.1.1", diff --git a/deps.bzl b/deps.bzl index f802fb27e7..e2d2609acf 100644 --- a/deps.bzl +++ b/deps.bzl @@ -36,9 +36,9 @@ def archive_dependencies(third_party): # Needed for "well-known protos" and @com_google_protobuf//:protoc. 
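 # (The 3.19.1 archive below is kept in step with the protobuf-java 3.19.1 runtime pinned in defs.bzl.)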
{ "name": "com_google_protobuf", - "sha256": "dd513a79c7d7e45cbaeaf7655289f78fd6b806e52dbbd7018ef4e3cf5cff697a", - "strip_prefix": "protobuf-3.15.8", - "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.15.8.zip"], + "sha256": "25f1292d4ea6666f460a2a30038eef121e6c3937ae0f61d610611dfb14b0bd32", + "strip_prefix": "protobuf-3.19.1", + "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.19.1.zip"], }, { "name": "com_github_bazelbuild_buildtools", From af3f34e143c263f10139a9eee6262fb190bdddb9 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:36:21 -0700 Subject: [PATCH 41/58] chore(deps) add transitive dependencies --- src/main/java/build/buildfarm/server/BUILD | 2 ++ src/main/java/build/buildfarm/worker/shard/BUILD | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index eccbf4fda2..7244807f8a 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -16,6 +16,8 @@ java_library( "//src/main/java/build/buildfarm/metrics/prometheus", "//src/main/java/build/buildfarm/server/services", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "@io_grpc_grpc_proto//:health_proto", + "@io_grpc_grpc_proto//:health_java_proto", "@maven//:com_github_pcj_google_options", "@maven//:com_google_guava_guava", "@maven//:io_grpc_grpc_api", diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index ec48ec5be0..4458d61821 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -20,6 +20,8 @@ java_library( "//src/main/java/build/buildfarm/worker/resources", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@googleapis//:google_rpc_error_details_java_proto", "@maven//:com_github_ben_manes_caffeine_caffeine", "@maven//:com_github_pcj_google_options", From 380f8a1bec163adffc7e0c883abf7f5a160a80ac Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:36:33 -0700 Subject: [PATCH 42/58] feat: add Proto reflection service to shard worker To aid connection troubleshooting --- src/main/java/build/buildfarm/worker/shard/Worker.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 7a0d4a60ed..038097de7f 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -70,6 +70,7 @@ import io.grpc.Status; import io.grpc.Status.Code; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; +import io.grpc.protobuf.services.ProtoReflectionService; import io.grpc.services.HealthStatusManager; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; @@ -158,6 +159,7 @@ private Server createServer( serverBuilder.addService(new ContentAddressableStorageService(instance)); serverBuilder.addService(new ByteStreamService(instance)); serverBuilder.addService(new ShutDownWorkerGracefully(this)); + serverBuilder.addService(ProtoReflectionService.newInstance()); // We will build a worker's server based on it's capabilities. // A worker that is capable of execution will construct an execution pipeline. 
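With ProtoReflectionService registered, a stock gRPC reflection client can enumerate the worker's services when troubleshooting connections. A sketch, assuming the worker serves plaintext gRPC on port 8981 (adjust host and port to your deployment):

    grpcurl -plaintext localhost:8981 list
    grpcurl -plaintext localhost:8981 describe google.bytestream.ByteStream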
From 7e7979d037294d1896ed6ae53d8a966e97ffc609 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:55:18 -0700 Subject: [PATCH 43/58] fixup! build: override grpc dependencies with our dependencies --- defs.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defs.bzl b/defs.bzl index ad8d07ced8..5b0e4f640c 100644 --- a/defs.bzl +++ b/defs.bzl @@ -8,7 +8,7 @@ load( "@io_bazel_rules_docker//repositories:repositories.bzl", container_repositories = "repositories", ) -load("@io_grpc_grpc_java//:repositories.bzl", "grpc_java_repositories", "IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS") +load("@io_grpc_grpc_java//:repositories.bzl", "IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS", "grpc_java_repositories") load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") load("@com_grail_bazel_toolchain//toolchain:rules.bzl", "llvm_toolchain") load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories") From 1f9d01fa70f1ecc419c9fca62f5602e5400d66c6 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Mon, 16 Oct 2023 11:55:29 -0700 Subject: [PATCH 44/58] fixup! chore(deps) add transitive dependencies --- src/main/java/build/buildfarm/server/BUILD | 2 +- src/main/java/build/buildfarm/worker/shard/BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index 7244807f8a..2cd750a4ff 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -16,8 +16,8 @@ java_library( "//src/main/java/build/buildfarm/metrics/prometheus", "//src/main/java/build/buildfarm/server/services", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@io_grpc_grpc_proto//:health_proto", "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@maven//:com_github_pcj_google_options", "@maven//:com_google_guava_guava", "@maven//:io_grpc_grpc_api", diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index 4458d61821..b62e4369de 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -20,9 +20,9 @@ java_library( "//src/main/java/build/buildfarm/worker/resources", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", "@io_grpc_grpc_proto//:health_java_proto", "@io_grpc_grpc_proto//:health_proto", - "@googleapis//:google_rpc_error_details_java_proto", "@maven//:com_github_ben_manes_caffeine_caffeine", "@maven//:com_github_pcj_google_options", "@maven//:com_google_code_findbugs_jsr305", From 578589fe6dfe075bbb92eb68b4235ed7a335afb0 Mon Sep 17 00:00:00 2001 From: amishra-u <119983081+amishra-u@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:00:14 -0700 Subject: [PATCH 45/58] Bug: Fix Blocked thread in WriteStreamObserver Caused by CASFile Write (#1486) * Add unit test * Signal Write on complete --- .../build/buildfarm/cas/cfc/CASFileCache.java | 3 + .../buildfarm/cas/cfc/CASFileCacheTest.java | 113 ++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java index aac1b28994..21c90e5fee 100644 --- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java +++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java @@ -878,6 +878,9 @@ public 
synchronized void reset() { + key.getIdentifier(), e); } finally { + if (closedFuture != null) { + closedFuture.set(null); + } isReset = true; } } diff --git a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java index a3d406e232..caaab02b35 100644 --- a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java @@ -19,8 +19,11 @@ import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; +import static java.lang.Thread.State.TERMINATED; +import static java.lang.Thread.State.WAITING; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.junit.Assert.fail; import static org.mockito.Mockito.any; @@ -58,6 +61,7 @@ import com.google.common.collect.Maps; import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; +import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; @@ -77,6 +81,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -1207,6 +1212,114 @@ protected InputStream newExternalInput( assertThat(expected).isNotNull(); } + @Test + public void testConcurrentWrites() throws Exception { + ByteString blob = ByteString.copyFromUtf8("concurrent write"); + Digest digest = DIGEST_UTIL.compute(blob); + UUID uuid = UUID.randomUUID(); + // The same instance of Write will be passed to both the threads, so that the both threads + // try to get same output stream. + Write write = + fileCache.getWrite( + Compressor.Value.IDENTITY, digest, uuid, RequestMetadata.getDefaultInstance()); + + CyclicBarrier barrier = new CyclicBarrier(3); + + Thread write1 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + barrier.await(); // let both the threads get same write stream. + writeStreamObserver.ownStream(); // let other thread get the ownership of stream + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "FirstRequest"); + Thread write2 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + writeStreamObserver.ownStream(); // this thread will get the ownership of stream + barrier.await(); // let both the threads get same write stream. 
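+ // write1 will now block inside write.getOutput(), since this thread owns the
+ // stream; the loop below waits for it to reach that WAITING state before writing.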
+ while (write1.getState() != WAITING) ; // wait for first request to go in wait state + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "SecondRequest"); + write1.start(); + write2.start(); + barrier.await(); // let both the requests reach the critical section + + // Wait for each write operation to complete, allowing a maximum of 100ms per write. + // Note: A 100ms wait time allowed 1000 * 8 successful test runs. + // In certain scenario, even this wait time may not be enough and test still be called flaky. + // But setting wait time 0 may cause test to wait forever (if there is issue in code) and the + // build might fail with timeout error. + write1.join(100); + write2.join(100); + + assertThat(write1.getState()).isEqualTo(TERMINATED); + assertThat(write2.getState()).isEqualTo(TERMINATED); + } + + static class ConcurrentWriteStreamObserver { + Write write; + FeedbackOutputStream out; + + ConcurrentWriteStreamObserver(Write write) { + this.write = write; + } + + void registerCallback() { + Futures.addCallback( + write.getFuture(), + new FutureCallback() { + @Override + public void onSuccess(Long committedSize) { + commit(); + } + + @Override + public void onFailure(Throwable t) { + // do nothing + } + }, + directExecutor()); + } + + synchronized void ownStream() throws Exception { + this.out = write.getOutput(10, MILLISECONDS, () -> {}); + } + /** + * Request 1 may invoke this method for request 2 or vice-versa via callback on + * write.getFuture(). Synchronization is necessary to prevent conflicts when this method is + * called simultaneously by different threads. + */ + synchronized void commit() { + // critical section + } + + void write(ByteString data) throws IOException { + data.writeTo(out); + } + + void close() throws IOException { + out.close(); + } + } + @RunWith(JUnit4.class) public static class NativeFileDirsIndexInMemoryCASFileCacheTest extends CASFileCacheTest { public NativeFileDirsIndexInMemoryCASFileCacheTest() throws IOException { From dfa5937090d36e7d39fae577c978412367b71c6c Mon Sep 17 00:00:00 2001 From: Stefano Baghino Date: Tue, 24 Oct 2023 21:38:09 +0200 Subject: [PATCH 46/58] Pin the Java toolchain to `remotejdk_17` (#1509) Closes #1508 Cleanups: - remove the unused `ubuntu-bionic` base image - replace `ubuntu-jammy:jammy-java11-gcc` with `ubuntu-mantic:mantic-java17-gcc` - replace `amazoncorretto:19` with `ubuntu-mantic:mantic-java17-gcc` - swap inverted log file names in a file --- .bazelci/run_server_test.sh | 4 ++-- .bazelrc | 6 +++++ BUILD | 6 ++--- images.bzl | 22 +++---------------- jvm_flags.bzl | 10 +++++++-- .../common/processes/JavaProcessWrapper.java | 6 ++++- .../bazel/processes/PersistentWorkerTest.java | 3 ++- src/main/java/build/buildfarm/BUILD | 6 ++--- src/test/java/build/buildfarm/cas/BUILD | 4 ++-- src/test/java/build/buildfarm/common/BUILD | 4 ++-- 10 files changed, 36 insertions(+), 35 deletions(-) diff --git a/.bazelci/run_server_test.sh b/.bazelci/run_server_test.sh index 21b6d70389..54e4658b16 100755 --- a/.bazelci/run_server_test.sh +++ b/.bazelci/run_server_test.sh @@ -8,11 +8,11 @@ bazel build //src/main/java/build/buildfarm:buildfarm-shard-worker bazel build //src/main/java/build/buildfarm:buildfarm-server # Start a single worker -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > server.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & 
echo "Started buildfarm-shard-worker..." # Start a single server -bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > server.log 2>&1 & echo "Started buildfarm-server..." echo "Wait for startup to finish..." diff --git a/.bazelrc b/.bazelrc index adbb3c68fd..c3caf4d8fb 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,3 +1,9 @@ +build --java_language_version=17 +build --java_runtime_version=remotejdk_17 + +build --tool_java_language_version=17 +build --tool_java_runtime_version=remotejdk_17 + common --enable_platform_specific_config build:fuse --define=fuse=true diff --git a/BUILD b/BUILD index f552ea9d7b..eb773e1a16 100644 --- a/BUILD +++ b/BUILD @@ -120,7 +120,7 @@ sh_binary( java_image( name = "buildfarm-server", args = ["/app/build_buildfarm/examples/config.minimal.yml"], - base = "@amazon_corretto_java_image_base//image", + base = "@ubuntu-mantic//image", classpath_resources = [ "//src/main/java/build/buildfarm:configs", ], @@ -148,14 +148,14 @@ oss_audit( # Download cgroup-tools so that the worker is able to restrict actions via control groups. download_pkgs( name = "worker_pkgs", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", packages = ["cgroup-tools"], tags = ["container"], ) install_pkgs( name = "worker_pkgs_image", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", installables_tar = ":worker_pkgs.tar", installation_cleanup_commands = "rm -rf /var/lib/apt/lists/*", output_image_name = "worker_pkgs_image", diff --git a/images.bzl b/images.bzl index 9ab0a8b0b7..939a752ed5 100644 --- a/images.bzl +++ b/images.bzl @@ -27,25 +27,9 @@ def buildfarm_images(): ) container_pull( - name = "ubuntu-bionic", - digest = "sha256:4bc527c7a288da405f2041928c63d0a6479a120ad63461c2f124c944def54be2", + name = "ubuntu-mantic", + digest = "sha256:1419bba15470a95242e917b3abcd8981ae36707b99df5ac705e1eee4d920c51c", registry = "index.docker.io", repository = "bazelbuild/buildfarm-worker-base", - tag = "bionic-java11-gcc", - ) - - container_pull( - name = "ubuntu-jammy", - digest = "sha256:da847ee259ebe7f00631a2f0146d9add60ff0f94b031a2e522ce94c78b1335c2", - registry = "index.docker.io", - repository = "bazelbuild/buildfarm-worker-base", - tag = "jammy-java11-gcc", - ) - - container_pull( - name = "amazon_corretto_java_image_base", - registry = "index.docker.io", - repository = "amazoncorretto", - tag = "19", - digest = "sha256:81d0df4412140416b27211c999e1f3c4565ae89a5cd92889475d20af422ba507", + tag = "mantic-java17-gcc", ) diff --git a/jvm_flags.bzl b/jvm_flags.bzl index 363f161465..440e0718aa 100644 --- a/jvm_flags.bzl +++ b/jvm_flags.bzl @@ -54,6 +54,12 @@ def ensure_accurate_metadata(): "//config:windows": ["-Dsun.nio.fs.ensureAccurateMetadata=true"], }) +def add_opens_sun_nio_fs(): + return select({ + "//conditions:default": [], + "//config:windows": ["--add-opens java.base/sun.nio.fs=ALL-UNNAMED"], + }) + def server_telemetry(): return select({ "//config:open_telemetry": SERVER_TELEMETRY_JVM_FLAGS, @@ -67,7 +73,7 @@ def worker_telemetry(): }) def server_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + server_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + server_telemetry() def worker_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + 
ensure_accurate_metadata() + worker_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + worker_telemetry() diff --git a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java index a27b6a9e99..89f2e6a5be 100644 --- a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java +++ b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java @@ -10,12 +10,16 @@ public class JavaProcessWrapper extends ProcessWrapper { + // Get the path of the JVM from the current process to avoid breaking the Bazel sandbox + public static final String CURRENT_JVM_COMMAND = + ProcessHandle.current().info().command().orElseThrow(() -> new RuntimeException("Unable to retrieve the path of the running JVM")); + public JavaProcessWrapper( Path workDir, String classPath, String fullClassName, String[] args ) throws IOException { super(workDir, cmdArgs( new String[]{ - "java", + CURRENT_JVM_COMMAND, "-cp", classPath, fullClassName diff --git a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java index 0cdc68a7ff..9712394203 100644 --- a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java +++ b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java @@ -16,6 +16,7 @@ import persistent.bazel.client.PersistentWorker; import persistent.bazel.client.WorkerKey; +import persistent.common.processes.JavaProcessWrapper; import persistent.testutil.ProcessUtils; import persistent.testutil.WorkerUtils; @@ -55,7 +56,7 @@ public void endToEndAdder() throws Exception { ); ImmutableList initCmd = ImmutableList.of( - "java", + JavaProcessWrapper.CURRENT_JVM_COMMAND, "-cp", jarPath.toString(), "adder.Adder", diff --git a/src/main/java/build/buildfarm/BUILD b/src/main/java/build/buildfarm/BUILD index 601aa38eb4..3cbdeb5231 100644 --- a/src/main/java/build/buildfarm/BUILD +++ b/src/main/java/build/buildfarm/BUILD @@ -1,4 +1,4 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") package( default_visibility = ["//src:__subpackages__"], @@ -15,7 +15,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.server.BuildFarmServer", visibility = ["//visibility:public"], runtime_deps = [ @@ -29,7 +29,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.worker.shard.Worker", visibility = ["//visibility:public"], runtime_deps = [ diff --git a/src/test/java/build/buildfarm/cas/BUILD b/src/test/java/build/buildfarm/cas/BUILD index ff7774e067..fc127b1ea6 100644 --- a/src/test/java/build/buildfarm/cas/BUILD +++ b/src/test/java/build/buildfarm/cas/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["**/*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), 
test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/cas", diff --git a/src/test/java/build/buildfarm/common/BUILD b/src/test/java/build/buildfarm/common/BUILD index bda4f4243f..70ff4abc94 100644 --- a/src/test/java/build/buildfarm/common/BUILD +++ b/src/test/java/build/buildfarm/common/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/common", From f6459d199d1a40bdd8b31a75d7c19da7a1af69f4 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Wed, 18 Oct 2023 13:32:18 -0700 Subject: [PATCH 47/58] docs: add markdown language specifiers for code blocks --- README.md | 28 ++++++++--------- _site/docs/configuration/configuration.md | 32 ++++++++++---------- _site/docs/execution/execution_policies.md | 26 ++++++++++------ _site/docs/execution/execution_properties.md | 25 +++++++++------ _site/docs/quick_start.md | 25 +++++++++------ 5 files changed, 77 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 675fe94dbd..cd9caf8ee3 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ All commandline options override corresponding config settings. Run via -``` -docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9 +```shell +$ docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9 redis-cli config set stop-writes-on-bgsave-error no ``` @@ -28,8 +28,8 @@ redis-cli config set stop-writes-on-bgsave-error no Run via -``` -bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- <configfile> +```shell +$ bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- <configfile> Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` @@ -40,8 +40,8 @@ Run via -``` -bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- <configfile> +```shell +$ bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- <configfile> Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml @@ -53,9 +53,9 @@ To use the example configured buildfarm with bazel (version 1.0 or higher), you can configure your `.bazelrc` as follows: -``` +```shell $ cat .bazelrc build --remote_executor=grpc://localhost:8980 ``` Then run your build as you would normally do. @@ -67,20 +67,20 @@ Buildfarm uses [Java's Logging framework](https://docs.oracle.com/javase/10/core... You can use typical Java logging configuration to filter these results and observe the flow of executions through your running services.
An example `logging.properties` file has been provided at [examples/logging.properties](examples/logging.properties) for use as follows: -``` -bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +```shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` and -``` -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml +``` shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml ``` To attach a remote debugger, run the executable with the `--debug=` flag. For example: -``` -bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml +```shell +$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml ``` diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index ae913adf88..ab3638f5b1 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -7,7 +7,7 @@ has_children: true Minimal required: -``` +```yaml backplane: redisUri: "redis://localhost:6379" queues: @@ -38,7 +38,7 @@ For an example configuration containing all of the configuration values, see `ex Example: -``` +```yaml digestFunction: SHA1 defaultActionTimeout: 1800 maximumActionTimeout: 1800 @@ -79,7 +79,7 @@ worker: Example: -``` +```yaml server: instanceType: SHARD name: shard @@ -96,7 +96,7 @@ server: Example: -``` +```yaml server: grpcMetrics: enabled: false @@ -114,7 +114,7 @@ server: Example: -``` +```yaml server: caches: directoryCacheMaxEntries: 10000 @@ -132,7 +132,7 @@ server: Example: -``` +```yaml server: admin: deploymentEnvironment: AWS @@ -151,14 +151,14 @@ server: Example: -``` +```yaml server: metrics: publisher: log logLevel: INFO ``` -``` +```yaml server: metrics: publisher: aws @@ -207,7 +207,7 @@ server: Example: -``` +```yaml backplane: type: SHARD redisUri: "redis://localhost:6379" @@ -224,7 +224,7 @@ backplane: Example: -``` +```yaml backplane: type: SHARD redisUri: "redis://localhost:6379" @@ -262,7 +262,7 @@ backplane: | realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot | | gracefulShutdownSeconds | Integer, 0 | | Time in seconds to allow for operations in flight to finish when shutdown signal is received | -``` +```yaml worker: port: 8981 publicName: "localhost:8981" @@ -279,7 +279,7 @@ worker: Example: -``` +```yaml worker: capabilities: cas: true @@ -296,7 +296,7 @@ worker: Example: -``` +```yaml worker: sandboxSettings: alwaysUse: true @@ -313,7 +313,7 @@ worker: Example: -``` +```yaml worker: dequeueMatchSettings: acceptEverything: true @@ -333,7 +333,7 @@ worker: Example: -``` +```yaml worker: storages: - type: FILESYSTEM @@ -361,7 +361,7 @@ worker: Example: -``` +```yaml worker: executionPolicies: - name: test diff --git a/_site/docs/execution/execution_policies.md 
b/_site/docs/execution/execution_policies.md index 19698a9925..f0eaf295d9 100644 --- a/_site/docs/execution/execution_policies.md +++ b/_site/docs/execution/execution_policies.md @@ -17,7 +17,7 @@ This policy type specifies that a worker should prepend a single path, and a num This example will use the buildfarm-provided executable `as-nobody`, which will upon execution demote itself to a `nobody` effective process owner uid, and perform an `execvp(2)` with the remaining provided program arguments, which will subsequently execute as a user that no longer matches the worker process. -``` +```yaml # default wrapper policy application worker: executionPolicies: @@ -50,7 +50,8 @@ These wrappers are used for detecting actions that rely on time. Below is a dem This addresses two problems in regards to an action's dependence on time. The 1st problem is when an action takes longer than it should because it's sleeping unnecessarily. The 2nd problem is when an action relies on time which causes it to eventually be broken on master despite the code not changing. Both problems are expressed below as unit tests. We demonstrate a time-spoofing mechanism (the re-writing of syscalls) which allows us to detect these problems generically over any action. The objective is to analyze builds for performance inefficiency and discover future instabilities before they occur. ### Issue 1 (slow test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -58,16 +59,19 @@ echo -n "testing... " sleep 10; echo "done" ``` + The test takes 10 seconds to run on average. -``` -bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 10.2s Stats over 10 runs: max = 10.2s, min = 10.1s, avg = 10.2s, dev = 0.0s ``` We can check for performance improvements by using the `skip-sleep` option. -``` -bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 1.0s Stats over 10 runs: max = 1.0s, min = 0.9s, avg = 1.0s, dev = 0.0s ``` @@ -75,7 +79,8 @@ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties=' Now the test is 10x faster. If skipping sleep makes an action perform significantly faster without affecting its success rate, that would warrant further investigation into the action's implementation. ### Issue 2 (future failing test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -89,12 +94,15 @@ echo "Times change." date exit -1; ``` + The test passes today, but will it pass tomorrow? Will it pass a year from now? We can find out by using the `time-shift` option. -``` -bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail + +```shell +$ bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail INFO: Found 1 test target... Times change. Mon Sep 25 18:31:09 UTC 2023 //cloud/buildfarm:future_fail FAILED in 18.0s ``` + Time is shifted to the year 2023 and the test now fails. We can fix the problem before others see it. 
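The same future-failure pattern can be reproduced outside of shell scripts. Below is a minimal Java sketch of the class of test that `time-shift` is designed to surface; the `FutureFailExample` name and the 2030 expiry date are illustrative assumptions, not part of buildfarm or its docs:

```java
import java.time.Instant;

public class FutureFailExample {
  // Hypothetical hardcoded lifetime, e.g. an embedded certificate expiry.
  private static final Instant EXPIRY = Instant.parse("2030-01-01T00:00:00Z");

  public static void main(String[] args) {
    // Passes today, but starts failing once wall-clock time crosses EXPIRY.
    // Running it under an executor with time-shift set far enough ahead
    // exposes the failure now rather than on some future build.
    if (Instant.now().isAfter(EXPIRY)) {
      System.err.println("Times change.");
      System.exit(1);
    }
    System.out.println("done");
  }
}
```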
diff --git a/_site/docs/execution/execution_properties.md b/_site/docs/execution/execution_properties.md index 85579ed099..7966c70dd2 100644 --- a/_site/docs/execution/execution_properties.md +++ b/_site/docs/execution/execution_properties.md @@ -76,37 +76,42 @@ Despite being given 1 core, they see all of the cpus and decide to spawn that ma **Standard Example:** This test will succeed when env var TESTVAR is foobar, and fail otherwise. -``` + +```shell #!/bin/bash [ "$TESTVAR" = "foobar" ] ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main FAIL ``` -``` -./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \ +```shell +$ ./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main PASS ``` + **Template Example:** If you give a range of cores, buildfarm has the authority to decide how many your operation actually claims. You can let buildfarm resolve this value for you (via [mustache](https://mustache.github.io/)). -``` +```bash #!/bin/bash [ "$MKL_NUM_THREADS" = "1" ] ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main FAIL ``` -``` -./bazel test \ + +```shell +$ ./bazel test \ --remote_default_exec_properties='env-vars="MKL_NUM_THREADS": "{{limits.cpu.claimed}}"' \ --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \ //env_test:main diff --git a/_site/docs/quick_start.md b/_site/docs/quick_start.md index 7af957ee63..8a9a9234db 100644 --- a/_site/docs/quick_start.md +++ b/_site/docs/quick_start.md @@ -25,7 +25,8 @@ Let's start with a bazel workspace with a single file to compile into an executa Create a new directory for our workspace and add the following files: `main.cc`: -``` + +```c #include int main( int argc, char *argv[] ) @@ -35,7 +36,8 @@ int main( int argc, char *argv[] ) ``` `BUILD`: -``` + +```starlark cc_binary( name = "main", srcs = ["main.cc"], @@ -118,15 +120,18 @@ That `2 remote` indicates that your compile and link ran remotely. Congratulatio ## Container Quick Start To bring up a minimal buildfarm cluster, you can run: + +```shell +$ ./examples/bf-run start ``` -./examples/bf-run start -``` + This will start all of the necessary containers at the latest version. Once the containers are up, you can build with `bazel run --remote_executor=grpc://localhost:8980 :main`. To stop the containers, run: -``` -./examples/bf-run stop + +```shell +$ ./examples/bf-run stop ``` ## Next Steps @@ -137,8 +142,8 @@ We've started our worker on the same host as our server, and also the same host You can now easily launch a new Buildfarm cluster locally or in AWS using an open sourced [Buildfarm Manager](https://github.com/80degreeswest/bfmgr). 
-``` -wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar -java -jar bfmgr-1.0.7.jar -Navigate to http://localhost +```shell +$ wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar +$ java -jar bfmgr-1.0.7.jar +$ open http://localhost ``` From 018e177642caebf76f8baeeba2b15c7c3b6f5286 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Wed, 25 Oct 2023 16:54:59 -0400 Subject: [PATCH 48/58] Support OutputPaths in OutputDirectory Specifying any number of OutputPaths will ignore OutputFiles (consistent with uploads). Where an OutputPath specifies an output directory, the action must be able to create the directory itself. --- .../worker/shard/CFCExecFileSystem.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index 041868f69c..d042ee0fb9 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -397,16 +397,25 @@ private Set linkedDirectories( return ImmutableSet.of(); } + private OutputDirectory createOutputDirectory(Command command) { + Iterable files; + Iterable dirs; + if (command.getOutputPathsCount() != 0) { + files = command.getOutputPathsList(); + dirs = ImmutableList.of(); // output paths require the action to create their own directory + } else { + files = command.getOutputFilesList(); + dirs = command.getOutputDirectoriesList(); + } + return OutputDirectory.parse(files, dirs, command.getEnvironmentVariablesList()); + } + @Override public Path createExecDir( String operationName, Map directoriesIndex, Action action, Command command) throws IOException, InterruptedException { Digest inputRootDigest = action.getInputRootDigest(); - OutputDirectory outputDirectory = - OutputDirectory.parse( - command.getOutputFilesList(), - command.getOutputDirectoriesList(), - command.getEnvironmentVariablesList()); + OutputDirectory outputDirectory = createOutputDirectory(command); Path execDir = root.resolve(operationName); if (Files.exists(execDir)) { From 8b370138da8f21401e78ed3d931e50194b1f4a06 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Thu, 26 Oct 2023 11:50:30 -0400 Subject: [PATCH 49/58] Permit Absolute Symlink Targets with configuration Partial specification of the absolute symlink response per REAPI. Remaining work will be in output identification. 
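The rule this patch enforces is small enough to state as a standalone sketch. The class and method names below are illustrative assumptions; the real check lives in `validateActionInputDirectory` and reports a `SYMLINK_TARGET_ABSOLUTE` violation through a `PreconditionFailure`, as the diff below shows:

```java
public class SymlinkTargetCheck {
  // Mirrors the added validation: a symlink target beginning with '/' is
  // rejected unless the instance is configured to allow absolute targets.
  static boolean isValidSymlinkTarget(String target, boolean allowSymlinkTargetAbsolute) {
    return allowSymlinkTargetAbsolute || !target.startsWith("/");
  }

  public static void main(String[] args) {
    System.out.println(isValidSymlinkTarget("../sibling/file", false)); // true: relative targets always pass
    System.out.println(isValidSymlinkTarget("/root/secret", false)); // false: SYMLINK_TARGET_ABSOLUTE violation
    System.out.println(isValidSymlinkTarget("/root/secret", true)); // true when absolute targets are allowed
  }
}
```

Setting the new `allowSymlinkTargetAbsolute: true` configuration value surfaces as `SymlinkAbsolutePathStrategy.ALLOWED` in the capabilities returned by `getCacheCapabilities`, so clients can discover the policy.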
--- _site/docs/configuration/configuration.md | 15 +++--- examples/config.yml | 1 + .../common/config/BuildfarmConfigs.java | 1 + .../server/AbstractServerInstance.java | 18 +++++++ .../instance/shard/ShardInstance.java | 14 +++++ .../worker/shard/CFCExecFileSystem.java | 7 ++- .../build/buildfarm/worker/shard/Worker.java | 1 + .../server/AbstractServerInstanceTest.java | 51 +++++++++++++++++++ 8 files changed, 100 insertions(+), 8 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index ab3638f5b1..ccfc8e026c 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -28,13 +28,14 @@ For an example configuration containing all of the configuration values, see `ex ### Common -| Configuration | Accepted and _Default_ Values | Command Line Argument | Description | -|----------------------|-------------------------------|-----------------------|---------------------------------------------------| -| digestFunction | _SHA256_, SHA1 | | Digest function for this implementation | -| defaultActionTimeout | Integer, _600_ | | Default timeout value for an action (seconds) | -| maximumActionTimeout | Integer, _3600_ | | Maximum allowed action timeout (seconds) | -| maxEntrySizeBytes | Long, _2147483648_ | | Maximum size of a single blob accepted (bytes) | -| prometheusPort | Integer, _9090_ | --prometheus_port | Listening port of the Prometheus metrics endpoint | +| Configuration | Accepted and _Default_ Values | Command Line Argument | Description | +|------------------------------|-------------------------------|-----------------------|--------------------------------------------------------------| +| digestFunction | _SHA256_, SHA1 | | Digest function for this implementation | +| defaultActionTimeout | Integer, _600_ | | Default timeout value for an action (seconds) | +| maximumActionTimeout | Integer, _3600_ | | Maximum allowed action timeout (seconds) | +| maxEntrySizeBytes | Long, _2147483648_ | | Maximum size of a single blob accepted (bytes) | +| prometheusPort | Integer, _9090_ | --prometheus_port | Listening port of the Prometheus metrics endpoint | +| allowSymlinkTargetAbsolute | boolean, _false_ | | Permit inputs to contain symlinks with absolute path targets | Example: diff --git a/examples/config.yml b/examples/config.yml index bce19a3d60..5229fcd07c 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -3,6 +3,7 @@ defaultActionTimeout: 600 maximumActionTimeout: 3600 maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 prometheusPort: 9090 +allowSymlinkTargetAbsolute: false server: instanceType: SHARD name: shard diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 444d25a13d..d6ebbe9e14 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -36,6 +36,7 @@ public final class BuildfarmConfigs { private long maximumActionTimeout = 3600; private long maxEntrySizeBytes = 2147483648L; // 2 * 1024 * 1024 * 1024 private int prometheusPort = 9090; + private boolean allowSymlinkTargetAbsolute = false; private Server server = new Server(); private Backplane backplane = new Backplane(); private Worker worker = new Worker(); diff --git a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java b/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java 
index d828b7bd89..541e13701d 100644 --- a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java +++ b/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java @@ -178,6 +178,8 @@ public abstract class AbstractServerInstance implements Instance { public static final String ENVIRONMENT_VARIABLES_NOT_SORTED = "The `Command`'s `environment_variables` are not correctly sorted by `name`."; + public static final String SYMLINK_TARGET_ABSOLUTE = "A symlink target is absolute."; + public static final String MISSING_ACTION = "The action was not found in the CAS."; public static final String MISSING_COMMAND = "The command was not found in the CAS."; @@ -790,6 +792,7 @@ public static void validateActionInputDirectory( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -838,6 +841,14 @@ public static void validateActionInputDirectory( .setSubject("/" + directoryPath + ": " + lastSymlinkName + " > " + symlinkName) .setDescription(DIRECTORY_NOT_SORTED); } + String symlinkTarget = symlinkNode.getTarget(); + if (!allowSymlinkTargetAbsolute && symlinkTarget.charAt(0) == '/') { + preconditionFailure + .addViolationsBuilder() + .setType(VIOLATION_TYPE_INVALID) + .setSubject("/" + directoryPath + ": " + symlinkName + " -> " + symlinkTarget) + .setDescription(SYMLINK_TARGET_ABSOLUTE); + } /* FIXME serverside validity check? regex? Preconditions.checkState( isValidFilename(symlinkName), @@ -907,6 +918,7 @@ public static void validateActionInputDirectory( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -922,6 +934,7 @@ private static void validateActionInputDirectoryDigest( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -946,6 +959,7 @@ private static void validateActionInputDirectoryDigest( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -1203,12 +1217,16 @@ protected void validateAction( ImmutableSet.Builder inputFilesBuilder = ImmutableSet.builder(); inputDirectoriesBuilder.add(ACTION_INPUT_ROOT_DIRECTORY_PATH); + boolean allowSymlinkTargetAbsolute = + getCacheCapabilities().getSymlinkAbsolutePathStrategy() + == SymlinkAbsolutePathStrategy.Value.ALLOWED; validateActionInputDirectoryDigest( ACTION_INPUT_ROOT_DIRECTORY_PATH, action.getInputRootDigest(), new Stack<>(), new HashSet<>(), directoriesIndex, + allowSymlinkTargetAbsolute, inputFilesBuilder::add, inputDirectoriesBuilder::add, onInputDigest, diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java index c1c07b08f0..241f1a8fa1 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java @@ -47,6 +47,7 @@ import build.bazel.remote.execution.v2.Action; import build.bazel.remote.execution.v2.ActionResult; import build.bazel.remote.execution.v2.BatchReadBlobsResponse.Response; +import build.bazel.remote.execution.v2.CacheCapabilities; import build.bazel.remote.execution.v2.Command; import build.bazel.remote.execution.v2.Compressor; import build.bazel.remote.execution.v2.Digest; @@ -60,6 +61,7 @@ import build.bazel.remote.execution.v2.Platform.Property; import 
build.bazel.remote.execution.v2.RequestMetadata; import build.bazel.remote.execution.v2.ResultsCachePolicy; +import build.bazel.remote.execution.v2.SymlinkAbsolutePathStrategy; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.actioncache.ShardActionCache; import build.buildfarm.backplane.Backplane; @@ -2726,4 +2728,16 @@ private boolean inDenyList(RequestMetadata requestMetadata) throws IOException { } return backplane.isBlacklisted(requestMetadata); } + + @Override + protected CacheCapabilities getCacheCapabilities() { + SymlinkAbsolutePathStrategy.Value symlinkAbsolutePathStrategy = + configs.isAllowSymlinkTargetAbsolute() + ? SymlinkAbsolutePathStrategy.Value.ALLOWED + : SymlinkAbsolutePathStrategy.Value.DISALLOWED; + return super.getCacheCapabilities() + .toBuilder() + .setSymlinkAbsolutePathStrategy(symlinkAbsolutePathStrategy) + .build(); + } } diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index d042ee0fb9..dba809fdcf 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -82,6 +82,9 @@ class CFCExecFileSystem implements ExecFileSystem { // indicate symlinking above for a set of matching paths private final Iterable linkedInputDirectories; + // permit symlinks to point to absolute paths in inputs + private final boolean allowSymlinkTargetAbsolute; + private final Map> rootInputFiles = new ConcurrentHashMap<>(); private final Map> rootInputDirectories = new ConcurrentHashMap<>(); private final ExecutorService fetchService = BuildfarmExecutors.getFetchServicePool(); @@ -95,6 +98,7 @@ class CFCExecFileSystem implements ExecFileSystem { @Nullable UserPrincipal owner, boolean linkInputDirectories, Iterable linkedInputDirectories, + boolean allowSymlinkTargetAbsolute, ExecutorService removeDirectoryService, ExecutorService accessRecorder) { this.root = root; @@ -104,6 +108,7 @@ class CFCExecFileSystem implements ExecFileSystem { this.linkedInputDirectories = Iterables.transform( linkedInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); + this.allowSymlinkTargetAbsolute = allowSymlinkTargetAbsolute; this.removeDirectoryService = removeDirectoryService; this.accessRecorder = accessRecorder; } @@ -179,7 +184,7 @@ public InputStream newInput(Compressor.Value compressor, Digest digest, long off private ListenableFuture putSymlink(Path path, SymlinkNode symlinkNode) { Path symlinkPath = path.resolve(symlinkNode.getName()); Path relativeTargetPath = path.getFileSystem().getPath(symlinkNode.getTarget()); - checkState(!relativeTargetPath.isAbsolute()); + checkState(allowSymlinkTargetAbsolute || !relativeTargetPath.isAbsolute()); return listeningDecorator(fetchService) .submit( () -> { diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 038097de7f..eaf8b6cb1a 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -337,6 +337,7 @@ private ExecFileSystem createCFCExecFileSystem( owner, configs.getWorker().isLinkInputDirectories(), configs.getWorker().getLinkedInputDirectories(), + configs.isAllowSymlinkTargetAbsolute(), removeDirectoryService, accessRecorder /* deadlineAfter=*/ diff --git a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java 
b/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java index add9136633..a23afaf0f7 100644 --- a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java @@ -23,6 +23,7 @@ import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_COMMAND; import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.AbstractServerInstance.SYMLINK_TARGET_ABSOLUTE; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; import static org.mockito.Mockito.any; @@ -43,6 +44,7 @@ import build.bazel.remote.execution.v2.OutputDirectory; import build.bazel.remote.execution.v2.Platform; import build.bazel.remote.execution.v2.RequestMetadata; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.cas.ContentAddressableStorage; @@ -270,6 +272,7 @@ public void duplicateFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFile=*/ file -> {}, /* onInputDirectorie=*/ directory -> {}, /* onInputDigest=*/ digest -> {}, @@ -304,6 +307,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(Digest.getDefaultInstance(), emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -327,6 +331,7 @@ public void unsortedFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -361,6 +366,7 @@ public void duplicateDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -395,6 +401,7 @@ public void unsortedDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -407,6 +414,48 @@ public void unsortedDirectoryInputIsInvalid() { assertThat(violation.getDescription()).isEqualTo(DIRECTORY_NOT_SORTED); } + @Test + public void shouldValidateIfSymlinkTargetAbsolute() { + // invalid for disallowed + PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); + Directory absoluteSymlinkDirectory = + Directory.newBuilder() + .addSymlinks(SymlinkNode.newBuilder().setName("foo").setTarget("/root/secret").build()) + .build(); + AbstractServerInstance.validateActionInputDirectory( + 
ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(1); + Violation violation = preconditionFailure.getViolationsList().get(0); + assertThat(violation.getType()).isEqualTo(VIOLATION_TYPE_INVALID); + assertThat(violation.getSubject()).isEqualTo("/: foo -> /root/secret"); + assertThat(violation.getDescription()).isEqualTo(SYMLINK_TARGET_ABSOLUTE); + + // valid for allowed + preconditionFailure = PreconditionFailure.newBuilder(); + AbstractServerInstance.validateActionInputDirectory( + ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ true, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(0); + } + @Test public void nestedOutputDirectoriesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); @@ -547,6 +596,7 @@ public void multipleIdenticalDirectoryMissingAreAllPreconditionFailures() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -608,6 +658,7 @@ public void validationRevisitReplicatesPreconditionFailures() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(fooDigest, foo), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, From df9ce1ddbd0a51b22eb535152794601125d59b84 Mon Sep 17 00:00:00 2001 From: Jason Schroeder Date: Sun, 29 Oct 2023 14:05:45 -0700 Subject: [PATCH 50/58] chore: update bazel to 6.4.0 (#1513) Trying to get more info on the Lombok stamping issue on Windows CI. 
See also https://github.com/bazelbuild/bazel/issues/10363 and https://github.com/bazelbuild/bazel/pull/18185 --- .bazelversion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bazelversion b/.bazelversion index 5e3254243a..19b860c187 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -6.1.2 +6.4.0 From bd740c93510bc9c5abe2ba1860ae10bbbf4eae08 Mon Sep 17 00:00:00 2001 From: Trevor Hickey Date: Mon, 30 Oct 2023 12:22:15 -0400 Subject: [PATCH 51/58] Rename instance types (#1514) --- .../src/main/resources/proto/buildfarm.proto | 8 +-- .../build/buildfarm/instance/server/BUILD | 2 +- ...tServerInstance.java => NodeInstance.java} | 11 ++-- .../shard/RemoteInputStreamFactory.java | 2 +- ...ShardInstance.java => ServerInstance.java} | 28 +++++----- .../buildfarm/server/BuildFarmServer.java | 6 +-- .../build/buildfarm/worker/shard/Worker.java | 5 +- ...orkerInstance.java => WorkerInstance.java} | 8 +-- ...nstanceTest.java => NodeInstanceTest.java} | 53 +++++++++---------- .../java/build/buildfarm/instance/shard/BUILD | 4 +- ...tanceTest.java => ServerInstanceTest.java} | 12 ++--- ...tanceTest.java => WorkerInstanceTest.java} | 6 +-- 12 files changed, 71 insertions(+), 74 deletions(-) rename src/main/java/build/buildfarm/instance/server/{AbstractServerInstance.java => NodeInstance.java} (99%) rename src/main/java/build/buildfarm/instance/shard/{ShardInstance.java => ServerInstance.java} (99%) rename src/main/java/build/buildfarm/worker/shard/{ShardWorkerInstance.java => WorkerInstance.java} (98%) rename src/test/java/build/buildfarm/instance/server/{AbstractServerInstanceTest.java => NodeInstanceTest.java} (94%) rename src/test/java/build/buildfarm/instance/shard/{ShardInstanceTest.java => ServerInstanceTest.java} (99%) rename src/test/java/build/buildfarm/worker/shard/{ShardWorkerInstanceTest.java => WorkerInstanceTest.java} (97%) diff --git a/admin/main/src/main/resources/proto/buildfarm.proto b/admin/main/src/main/resources/proto/buildfarm.proto index 8a000132a3..d391847cb9 100644 --- a/admin/main/src/main/resources/proto/buildfarm.proto +++ b/admin/main/src/main/resources/proto/buildfarm.proto @@ -520,7 +520,7 @@ message RedisShardBackplaneConfig { int32 max_attempts = 33; } -message ShardInstanceConfig { +message ServerInstanceConfig { bool run_dispatched_monitor = 1; int32 dispatched_monitor_interval_seconds = 2; @@ -544,7 +544,7 @@ message ShardInstanceConfig { google.protobuf.Duration grpc_timeout = 8; } -message ShardWorkerInstanceConfig { +message WorkerInstanceConfig { // whether to stream stdout from processes bool stream_stdout = 6; @@ -568,7 +568,7 @@ message ShardWorkerInstanceConfig { } message ShardWorkerConfig { - ShardWorkerInstanceConfig shard_worker_instance_config = 1; + WorkerInstanceConfig shard_worker_instance_config = 1; int32 port = 2; @@ -836,7 +836,7 @@ message InstanceConfig { oneof type { MemoryInstanceConfig memory_instance_config = 3; - ShardInstanceConfig shard_instance_config = 4; + ServerInstanceConfig shard_instance_config = 4; } } diff --git a/src/main/java/build/buildfarm/instance/server/BUILD b/src/main/java/build/buildfarm/instance/server/BUILD index ecff968e79..1c63e9896c 100644 --- a/src/main/java/build/buildfarm/instance/server/BUILD +++ b/src/main/java/build/buildfarm/instance/server/BUILD @@ -1,8 +1,8 @@ java_library( name = "server", srcs = [ - "AbstractServerInstance.java", "GetDirectoryFunction.java", + "NodeInstance.java", "OperationsMap.java", "WatchFuture.java", ], diff --git 
a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java b/src/main/java/build/buildfarm/instance/server/NodeInstance.java similarity index 99% rename from src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java rename to src/main/java/build/buildfarm/instance/server/NodeInstance.java index 541e13701d..e44305f32e 100644 --- a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java +++ b/src/main/java/build/buildfarm/instance/server/NodeInstance.java @@ -144,7 +144,7 @@ import lombok.extern.java.Log; @Log -public abstract class AbstractServerInstance implements Instance { +public abstract class NodeInstance implements Instance { private final String name; protected final ContentAddressableStorage contentAddressableStorage; protected final ActionCache actionCache; @@ -229,7 +229,7 @@ public abstract class AbstractServerInstance implements Instance { public static final String NO_REQUEUE_COMPLETE_MESSAGE = "Operation %s not requeued. Operation has already completed."; - public AbstractServerInstance( + public NodeInstance( String name, DigestUtil digestUtil, ContentAddressableStorage contentAddressableStorage, @@ -1967,19 +1967,18 @@ public ServerCapabilities getCapabilities() { @Override public WorkerProfileMessage getWorkerProfile() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerProfile() method."); + "NodeInstance doesn't support getWorkerProfile() method."); } @Override public WorkerListMessage getWorkerList() { - throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerList() method."); + throw new UnsupportedOperationException("NodeInstance doesn't support getWorkerList() method."); } @Override public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support drainWorkerPipeline() method."); + "NodeInstance doesn't support drainWorkerPipeline() method."); } @Override diff --git a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java index 640878707f..8a00e9ae96 100644 --- a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java @@ -30,7 +30,7 @@ import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance.WorkersCallback; +import build.buildfarm.instance.shard.ServerInstance.WorkersCallback; import com.google.common.base.Throwables; import com.google.common.cache.LoadingCache; import com.google.common.collect.Iterables; diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java similarity index 99% rename from src/main/java/build/buildfarm/instance/shard/ShardInstance.java rename to src/main/java/build/buildfarm/instance/shard/ServerInstance.java index 241f1a8fa1..ab865b0e0e 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java @@ -81,7 +81,7 @@ import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.Instance; import build.buildfarm.instance.MatchListener; -import 
build.buildfarm.instance.server.AbstractServerInstance; +import build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -167,7 +167,7 @@ import lombok.extern.java.Log; @Log -public class ShardInstance extends AbstractServerInstance { +public class ServerInstance extends NodeInstance { private static final ListenableFuture IMMEDIATE_VOID_FUTURE = Futures.immediateFuture(null); private static final String TIMEOUT_OUT_OF_BOUNDS = @@ -259,14 +259,14 @@ private static Backplane createBackplane(String identifier) throws Configuration identifier, /* subscribeToBackplane=*/ true, configs.getServer().isRunFailsafeOperation(), - ShardInstance::stripOperation, - ShardInstance::stripQueuedOperation); + ServerInstance::stripOperation, + ServerInstance::stripQueuedOperation); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); } } - public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) + public ServerInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) throws InterruptedException, ConfigurationException { this( name, @@ -276,7 +276,7 @@ public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runn /* actionCacheFetchService=*/ BuildfarmExecutors.getActionCacheFetchServicePool()); } - private ShardInstance( + private ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -328,7 +328,7 @@ void initializeCaches() { .build(); } - public ShardInstance( + public ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -1086,7 +1086,7 @@ public void onError(Throwable t) { backplane, workerSet, locationSet, - ShardInstance.this::workerStub, + ServerInstance.this::workerStub, blobDigest, directExecutor(), RequestMetadata.getDefaultInstance()), @@ -2251,7 +2251,7 @@ public ListenableFuture queue(ExecuteEntry executeEntry, Poller poller, Du log.log( Level.FINER, format( - "ShardInstance(%s): checkCache(%s): %sus elapsed", + "ServerInstance(%s): checkCache(%s): %sus elapsed", getName(), operation.getName(), checkCacheUSecs)); return IMMEDIATE_VOID_FUTURE; } @@ -2278,7 +2278,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): fetching action %s", + "ServerInstance(%s): queue(%s): fetching action %s", getName(), operation.getName(), actionDigest.getHash())); RequestMetadata requestMetadata = executeEntry.getRequestMetadata(); ListenableFuture actionFuture = @@ -2321,7 +2321,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): fetched action %s transforming queuedOperation", + "ServerInstance(%s): queue(%s): fetched action %s transforming queuedOperation", getName(), operation.getName(), actionDigest.getHash())); Stopwatch transformStopwatch = Stopwatch.createStarted(); return transform( @@ -2351,7 +2351,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s transformed, validating", + "ServerInstance(%s): queue(%s): queuedOperation %s transformed, validating", getName(), operation.getName(), DigestUtil.toString( @@ -2373,7 +2373,7 @@ private ListenableFuture transformAndQueue( log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s validated, uploading", + "ServerInstance(%s): queue(%s): 
queuedOperation %s validated, uploading", getName(), operation.getName(), DigestUtil.toString( @@ -2425,7 +2425,7 @@ public void onSuccess(ProfiledQueuedOperationMetadata profiledQueuedMetadata) { log.log( Level.FINER, format( - "ShardInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", + "ServerInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", getName(), queueOperation.getName(), checkCacheUSecs, diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index a20f4a6f77..8672c0df76 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -29,7 +29,7 @@ import build.buildfarm.common.services.ByteStreamService; import build.buildfarm.common.services.ContentAddressableStorageService; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance; +import build.buildfarm.instance.shard.ServerInstance; import build.buildfarm.metrics.prometheus.PrometheusPublisher; import build.buildfarm.server.services.ActionCacheService; import build.buildfarm.server.services.CapabilitiesService; @@ -109,9 +109,9 @@ public void prepareServerForGracefulShutdown() { } } - private ShardInstance createInstance() + private ServerInstance createInstance() throws IOException, ConfigurationException, InterruptedException { - return new ShardInstance( + return new ServerInstance( configs.getServer().getName(), configs.getServer().getSession() + "-" + configs.getServer().getName(), new DigestUtil(configs.getDigestFunction()), diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index eaf8b6cb1a..fd556fc8e8 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -123,7 +123,7 @@ public final class Worker extends LoggingMain { private boolean inGracefulShutdown = false; private boolean isPaused = false; - private ShardWorkerInstance instance; + private WorkerInstance instance; @SuppressWarnings("deprecation") private final HealthStatusManager healthStatusManager = new HealthStatusManager(); @@ -532,8 +532,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep remoteInputStreamFactory, removeDirectoryService, accessRecorder, storage); instance = - new ShardWorkerInstance( - configs.getWorker().getPublicName(), digestUtil, backplane, storage); + new WorkerInstance(configs.getWorker().getPublicName(), digestUtil, backplane, storage); // Create the appropriate writer for the context CasWriter writer; diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java similarity index 98% rename from src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java rename to src/main/java/build/buildfarm/worker/shard/WorkerInstance.java index a891af7faa..e100417f53 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java @@ -36,7 +36,7 @@ import build.buildfarm.common.Write; import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.MatchListener; -import build.buildfarm.instance.server.AbstractServerInstance; +import 
build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -68,13 +68,13 @@ import lombok.extern.java.Log; @Log -public class ShardWorkerInstance extends AbstractServerInstance { +public class WorkerInstance extends NodeInstance { private static final Counter IO_METRIC = Counter.build().name("io_bytes_read").help("Read I/O (bytes)").register(); private final Backplane backplane; - public ShardWorkerInstance( + public WorkerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -346,7 +346,7 @@ protected static ExecuteOperationMetadata expectExecuteOperationMetadata(Operati return null; } } else { - return AbstractServerInstance.expectExecuteOperationMetadata(operation); + return NodeInstance.expectExecuteOperationMetadata(operation); } } diff --git a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java similarity index 94% rename from src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java rename to src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java index a23afaf0f7..454440281f 100644 --- a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java @@ -17,13 +17,13 @@ import static build.buildfarm.common.Actions.checkPreconditionFailure; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; -import static build.buildfarm.instance.server.AbstractServerInstance.DIRECTORY_NOT_SORTED; -import static build.buildfarm.instance.server.AbstractServerInstance.DUPLICATE_DIRENT; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_COMMAND; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; -import static build.buildfarm.instance.server.AbstractServerInstance.SYMLINK_TARGET_ABSOLUTE; +import static build.buildfarm.instance.server.NodeInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; +import static build.buildfarm.instance.server.NodeInstance.DIRECTORY_NOT_SORTED; +import static build.buildfarm.instance.server.NodeInstance.DUPLICATE_DIRENT; +import static build.buildfarm.instance.server.NodeInstance.INVALID_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.SYMLINK_TARGET_ABSOLUTE; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; import static org.mockito.Mockito.any; @@ -99,10 +99,10 @@ @RunWith(JUnit4.class) @Log -public class AbstractServerInstanceTest { +public class NodeInstanceTest { private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); - static class DummyServerInstance extends AbstractServerInstance { + static class DummyServerInstance extends NodeInstance { DummyServerInstance( ContentAddressableStorage contentAddressableStorage, ActionCache 
actionCache) { super( @@ -261,7 +261,7 @@ public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() @Test public void duplicateFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -290,7 +290,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -320,7 +320,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { @Test public void unsortedFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -349,7 +349,7 @@ public void duplicateDirectoryInputIsInvalid() { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -384,7 +384,7 @@ public void unsortedDirectoryInputIsInvalid() { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -422,7 +422,7 @@ public void shouldValidateIfSymlinkTargetAbsolute() { Directory.newBuilder() .addSymlinks(SymlinkNode.newBuilder().setName("foo").setTarget("/root/secret").build()) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, absoluteSymlinkDirectory, /* pathDigests=*/ new Stack<>(), @@ -442,7 +442,7 @@ public void shouldValidateIfSymlinkTargetAbsolute() { // valid for allowed preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, absoluteSymlinkDirectory, /* pathDigests=*/ new Stack<>(), @@ -459,7 +459,7 @@ public void shouldValidateIfSymlinkTargetAbsolute() { @Test public void nestedOutputDirectoriesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of(), @@ -476,7 +476,7 @@ public void nestedOutputDirectoriesAreInvalid() { @Test public void outputDirectoriesContainingOutputFilesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - 
AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo/bar"), @@ -493,7 +493,7 @@ public void outputDirectoriesContainingOutputFilesAreInvalid() { @Test public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo"), @@ -509,7 +509,7 @@ public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { @Test public void emptyArgumentListIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -529,7 +529,7 @@ public void emptyArgumentListIsInvalid() { @Test public void absoluteWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -549,7 +549,7 @@ public void absoluteWorkingDirectoryIsInvalid() { @Test public void undeclaredWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); Digest inputRootDigest = DIGEST_UTIL.compute(Directory.getDefaultInstance()); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); @@ -590,7 +590,7 @@ public void multipleIdenticalDirectoryMissingAreAllPreconditionFailures() { .setDigest(missingDirectoryDigest) .build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), @@ -652,7 +652,7 @@ public void validationRevisitReplicatesPreconditionFailures() { DirectoryNode.newBuilder().setName("bar").setDigest(fooDigest).build(), DirectoryNode.newBuilder().setName("foo").setDigest(fooDigest).build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), @@ -721,8 +721,7 @@ public void outputDirectoriesFilesAreEnsuredPresent() throws Exception { .build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); ActionCache actionCache = mock(ActionCache.class); - AbstractServerInstance instance = - new DummyServerInstance(contentAddressableStorage, actionCache); + NodeInstance instance = new DummyServerInstance(contentAddressableStorage, actionCache); Tree tree = Tree.newBuilder() @@ -782,7 +781,7 @@ public void fetchBlobWriteCompleteIsSuccess() throws Exception { Digest expectedDigest = contentDigest.toBuilder().setSizeBytes(-1).build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); - AbstractServerInstance instance = new DummyServerInstance(contentAddressableStorage, null); + NodeInstance instance = new DummyServerInstance(contentAddressableStorage, null); RequestMetadata requestMetadata = RequestMetadata.getDefaultInstance(); Write write = mock(Write.class); diff --git a/src/test/java/build/buildfarm/instance/shard/BUILD b/src/test/java/build/buildfarm/instance/shard/BUILD index cd0ae18d24..b635c74ba5 100644 --- 
a/src/test/java/build/buildfarm/instance/shard/BUILD +++ b/src/test/java/build/buildfarm/instance/shard/BUILD @@ -110,10 +110,10 @@ java_test( ) java_test( - name = "ShardInstanceTest", + name = "ServerInstanceTest", size = "small", srcs = [ - "ShardInstanceTest.java", + "ServerInstanceTest.java", "UnobservableWatcher.java", ], data = ["//examples:example_configs"], diff --git a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java similarity index 99% rename from src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java rename to src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java index aa1adc5d3d..cd959950a0 100644 --- a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java @@ -20,9 +20,9 @@ import static build.buildfarm.common.Actions.invalidActionVerboseMessage; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_PLATFORM; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_ACTION; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.INVALID_PLATFORM; +import static build.buildfarm.instance.server.NodeInstance.MISSING_ACTION; +import static build.buildfarm.instance.server.NodeInstance.MISSING_COMMAND; import static com.google.common.base.Predicates.notNull; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; @@ -121,14 +121,14 @@ import org.mockito.stubbing.Answer; @RunWith(JUnit4.class) -public class ShardInstanceTest { +public class ServerInstanceTest { private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); private static final long QUEUE_TEST_TIMEOUT_SECONDS = 3; private static final Duration DEFAULT_TIMEOUT = Durations.fromSeconds(60); private static final Command SIMPLE_COMMAND = Command.newBuilder().addAllArguments(ImmutableList.of("true")).build(); - private ShardInstance instance; + private ServerInstance instance; private Map blobDigests; @Mock private Backplane mockBackplane; @@ -145,7 +145,7 @@ public void setUp() throws InterruptedException { blobDigests = Maps.newHashMap(); ActionCache actionCache = new ShardActionCache(10, mockBackplane, newDirectExecutorService()); instance = - new ShardInstance( + new ServerInstance( "shard", DIGEST_UTIL, mockBackplane, diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java similarity index 97% rename from src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java rename to src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java index 7a4e40be26..3df73187bb 100644 --- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java +++ b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java @@ -50,19 +50,19 @@ import org.mockito.MockitoAnnotations; @RunWith(JUnit4.class) -public class ShardWorkerInstanceTest { +public class WorkerInstanceTest { private final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); @Mock private Backplane backplane; @Mock private ContentAddressableStorage storage; - private 
ShardWorkerInstance instance; + private WorkerInstance instance; @Before public void setUp() throws Exception { MockitoAnnotations.initMocks(this); - instance = new ShardWorkerInstance("test", DIGEST_UTIL, backplane, storage); + instance = new WorkerInstance("test", DIGEST_UTIL, backplane, storage); } @SuppressWarnings("unchecked") From 2a61f77dc1713f276674bdbc9a839ec2732feb07 Mon Sep 17 00:00:00 2001 From: George Gensure Date: Mon, 30 Oct 2023 14:48:00 -0400 Subject: [PATCH 52/58] Create SymlinkNode outputs during upload (#1515) Default disabled, available with createSymlinkOutputs option in Worker config. --- _site/docs/configuration/configuration.md | 1 + examples/config.yml | 1 + .../build/buildfarm/common/config/Worker.java | 1 + .../worker/shard/ShardWorkerContext.java | 38 +++++++++++++++++-- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index ccfc8e026c..466375373f 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -262,6 +262,7 @@ backplane: | errorOperationRemainingResources | boolean, _false_ | | | | realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot | | gracefulShutdownSeconds | Integer, 0 | | Time in seconds to allow for operations in flight to finish when shutdown signal is received | +| createSymlinkOutputs | boolean, _false_ | | Creates SymlinkNodes for symbolic links discovered in output paths for actions. No verification of the symlink target path occurs. Buildstream, for example, requires this. | ```yaml worker: diff --git a/examples/config.yml b/examples/config.yml index 5229fcd07c..008bf2e1ca 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -131,6 +131,7 @@ worker: alwaysUse: false selectForBlockNetwork: false selectForTmpFs: false + createSymlinkOutputs: false executionPolicies: - name: test executionWrapper: diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java index e987c1e370..c446134fe2 100644 --- a/src/main/java/build/buildfarm/common/config/Worker.java +++ b/src/main/java/build/buildfarm/common/config/Worker.java @@ -39,6 +39,7 @@ public class Worker { private int gracefulShutdownSeconds = 0; private ExecutionPolicy[] executionPolicies = {}; private SandboxSettings sandboxSettings = new SandboxSettings(); + private boolean createSymlinkOutputs = false; // These limited resources are only for the individual worker. // An example would be hardware resources such as GPUs. 
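For illustration (not part of the patch itself), a minimal sketch of what the worker records for an output symlink once createSymlinkOutputs is enabled. The /tmp/out/result path is an assumed example; as the documentation row above notes, the target is captured verbatim, with no verification or resolution against the execution root:

import build.bazel.remote.execution.v2.SymlinkNode;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Sketch: represent an action output that is a symbolic link as a SymlinkNode.
public class SymlinkOutputSketch {
  public static void main(String[] args) throws IOException {
    Path link = Paths.get("/tmp/out/result"); // assumed symlink created by an action
    SymlinkNode node =
        SymlinkNode.newBuilder()
            .setName(link.getFileName().toString())
            // absolute targets (e.g. /root/secret) pass through unchanged; see the TODO below
            .setTarget(Files.readSymbolicLink(link).toString())
            .build();
    System.out.println(node);
  }
}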
diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index c844f53963..98474b439c 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -31,6 +31,7 @@ import build.bazel.remote.execution.v2.ExecutionStage; import build.bazel.remote.execution.v2.FileNode; import build.bazel.remote.execution.v2.Platform; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.backplane.Backplane; import build.buildfarm.common.CommandUtils; @@ -572,6 +573,7 @@ private void uploadOutputFile( static class OutputDirectoryContext { private final List files = new ArrayList<>(); private final List directories = new ArrayList<>(); + private final List symlinks = new ArrayList<>(); void addFile(FileNode fileNode) { files.add(fileNode); @@ -581,10 +583,19 @@ void addDirectory(DirectoryNode directoryNode) { directories.add(directoryNode); } + void addSymlink(SymlinkNode symlinkNode) { + symlinks.add(symlinkNode); + } + Directory toDirectory() { files.sort(Comparator.comparing(FileNode::getName)); directories.sort(Comparator.comparing(DirectoryNode::getName)); - return Directory.newBuilder().addAllFiles(files).addAllDirectories(directories).build(); + symlinks.sort(Comparator.comparing(SymlinkNode::getName)); + return Directory.newBuilder() + .addAllFiles(files) + .addAllDirectories(directories) + .addAllSymlinks(symlinks) + .build(); } } @@ -621,8 +632,30 @@ private void uploadOutputDirectory( @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (configs.getWorker().isCreateSymlinkOutputs() && attrs.isSymbolicLink()) { + visitSymbolicLink(file); + } else { + visitRegularFile(file, attrs); + } + return FileVisitResult.CONTINUE; + } + + private void visitSymbolicLink(Path file) throws IOException { + // TODO convert symlinks with absolute targets within execution root to relative ones + currentDirectory.addSymlink( + SymlinkNode.newBuilder() + .setName(file.getFileName().toString()) + .setTarget(Files.readSymbolicLink(file).toString()) + .build()); + } + + private void visitRegularFile(Path file, BasicFileAttributes attrs) throws IOException { Digest digest; try { + // should we create symlink nodes in output? + // is buildstream trying to execute in a specific container?? + // can get to NSFE for nonexistent symlinks + // can fail outright for a symlink to a directory digest = getDigestUtil().compute(file); } catch (NoSuchFileException e) { log.log( @@ -631,7 +664,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) "error visiting file %s under output dir %s", outputDirPath.relativize(file), outputDirPath.toAbsolutePath()), e); - return FileVisitResult.CONTINUE; + return; } // should we cast to PosixFilePermissions and do gymnastics there for executable? 
@@ -655,7 +688,6 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) .setDescription( "An output could not be uploaded because it exceeded the maximum size of an entry"); } - return FileVisitResult.CONTINUE; } @Override

From 76c2657111e5ebe92bdc99195a892195aa992be0 Mon Sep 17 00:00:00 2001
From: amishra-u <119983081+amishra-u@users.noreply.github.com>
Date: Mon, 30 Oct 2023 19:26:18 -0700
Subject: [PATCH 53/58] feat: Implement CAS lease extension (#1455)

Problem:
Enabling the findMissingBlobsViaBackplane flag in BuildfarmServer eliminates the need for the BuildfarmWorker's fmb API call. This BuildfarmWorker:fmb call was also responsible for tracking CAS entry access. As a result, our CAS cache eviction strategy shifted from LRU to FIFO. When the findMissingBlobsViaBackplane flag is enabled, the buildfarm relies on the backplane as the definitive source for CAS availability. Since we don't update CAS expiry on each access, the backplane will independently expire CAS entries based on the specified cas_expire duration, even if they are actively being read.

Solution:
Updated bfServer:fmb call to perform non-blocking fmb calls to workers, allowing these workers to record access for the relevant CAS entries. Extended expiry duration for available CAS entries in the backplane on each fmb call.

With these changes, we can utilize Bazel's experimental_remote_cache_lease_extension and experimental_remote_cache_ttl flags for incremental builds.

Closes #1428
---
 defs.bzl | 1 +
 .../build/buildfarm/backplane/Backplane.java | 3 +
 .../instance/shard/CasWorkerMap.java | 7 +++
 .../instance/shard/JedisCasWorkerMap.java | 11 ++++
 .../instance/shard/RedisShardBackplane.java | 5 ++
 .../instance/shard/RedissonCasWorkerMap.java | 8 +++
 .../instance/shard/ServerInstance.java | 57 ++++++++++++++---
 .../java/build/buildfarm/instance/shard/BUILD | 19 ++++++
 .../instance/shard/JedisCasWorkerMapTest.java | 63 +++++++++++++++++++
 .../instance/shard/ServerInstanceTest.java | 14 +++++
 10 files changed, 180 insertions(+), 8 deletions(-)
 create mode 100644 src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java

diff --git a/defs.bzl b/defs.bzl index 5b0e4f640c..4b0a5e49bf 100644 --- a/defs.bzl +++ b/defs.bzl @@ -60,6 +60,7 @@ def buildfarm_init(name = "buildfarm"): "com.fasterxml.jackson.core:jackson-databind:2.15.0", "com.github.ben-manes.caffeine:caffeine:2.9.0", "com.github.docker-java:docker-java:3.2.11", + "com.github.fppt:jedis-mock:1.0.10", "com.github.jnr:jffi:1.2.16", "com.github.jnr:jffi:jar:native:1.2.16", "com.github.jnr:jnr-constants:0.9.9", diff --git a/src/main/java/build/buildfarm/backplane/Backplane.java b/src/main/java/build/buildfarm/backplane/Backplane.java index 557a7dc163..e13616b32e 100644 --- a/src/main/java/build/buildfarm/backplane/Backplane.java +++ b/src/main/java/build/buildfarm/backplane/Backplane.java @@ -278,4 +278,7 @@ boolean pollOperation(QueueEntry queueEntry, ExecutionStage.Value stage, long re Boolean propertiesEligibleForQueue(List provisions); GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest request) throws IOException; + + /** Set expiry time for digests */ + void updateDigestsExpiry(Iterable digests) throws IOException; } diff --git a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java index 794b296a1f..55b2933e42 100644 --- a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java +++ 
b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java @@ -121,4 +121,11 @@ Map> getMap(RedisClient client, Iterable blobDigests * @note Suggested return identifier: mapSize. */ int size(RedisClient client) throws IOException; + + /** + * @brief Set the expiry duration for the digests. + * @param client Client used for interacting with redis when not using cacheMap. + * @param blobDigests The blob digests for which to set the new expiry duration. + */ + void setExpire(RedisClient client, Iterable blobDigests) throws IOException; } diff --git a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java index d035d10491..65d1f06b87 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java @@ -234,6 +234,17 @@ public int size(RedisClient client) throws IOException { return client.call(jedis -> ScanCount.get(jedis, name + ":*", 1000)); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) throws IOException { + client.run( + jedis -> { + for (Digest blobDigest : blobDigests) { + String key = redisCasKey(blobDigest); + jedis.expire(key, keyExpiration_s); + } + }); + } + /** * @brief Get the redis key name. * @details This is to be used for the direct redis implementation. diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index ebb832f5aa..535473f2a9 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -1501,4 +1501,9 @@ public GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest req } return GetClientStartTimeResult.newBuilder().addAllClientStartTime(startTimes).build(); } + + @Override + public void updateDigestsExpiry(Iterable digests) throws IOException { + state.casWorkerMap.setExpire(client, digests); + } } diff --git a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java index 234e15fb15..52b010c31f 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java @@ -209,6 +209,14 @@ public int size(RedisClient client) { return cacheMap.size(); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) { + for (Digest blobDigest : blobDigests) { + String key = cacheMapCasKey(blobDigest); + cacheMap.expireKey(key, keyExpiration_s, TimeUnit.SECONDS); + } + } + /** * @brief Get a random element from the set. * @details Assumes the set is not empty. 
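Both setExpire implementations above reduce to re-arming a Redis TTL when a digest is touched. A minimal sketch of that semantics with the bare Jedis client, assuming a local Redis; the key name and the 60-second lease below are illustrative only:

import redis.clients.jedis.Jedis;

// Sketch: re-arming the TTL on each access makes entries expire relative to
// last use (LRU-like) instead of relative to insertion (FIFO-like).
public class LeaseExtensionSketch {
  public static void main(String[] args) {
    try (Jedis jedis = new Jedis("localhost", 6379)) { // assumed local redis
      String casKey = "ContentAddressableStorage:deadbeef/4"; // hypothetical digest key
      jedis.sadd(casKey, "worker1"); // digest -> workers holding the blob
      jedis.expire(casKey, 60); // initial lease: 60 seconds from insertion
      // A later findMissingBlobs touch renews the lease instead of letting it lapse:
      jedis.expire(casKey, 60); // TTL is now 60 seconds from this access
    }
  }
}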
diff --git a/src/main/java/build/buildfarm/instance/shard/ServerInstance.java b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java index ab865b0e0e..b1cbed0e07 100644 --- a/src/main/java/build/buildfarm/instance/shard/ServerInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java @@ -135,10 +135,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.time.Instant; +import java.util.AbstractMap; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; import java.util.Deque; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -660,7 +662,7 @@ public ListenableFuture> findMissingBlobs( } if (configs.getServer().isFindMissingBlobsViaBackplane()) { - return findMissingBlobsViaBackplane(nonEmptyDigests); + return findMissingBlobsViaBackplane(nonEmptyDigests, requestMetadata); } return findMissingBlobsQueryingEachWorker(nonEmptyDigests, requestMetadata); @@ -727,24 +729,40 @@ private ListenableFuture> findMissingBlobsQueryingEachWorker( // out-of-date and the server lies about which blobs are actually present. We provide this // alternative strategy for calculating missing blobs. private ListenableFuture> findMissingBlobsViaBackplane( - Iterable nonEmptyDigests) { + Iterable nonEmptyDigests, RequestMetadata requestMetadata) { try { Set uniqueDigests = new HashSet<>(); nonEmptyDigests.forEach(uniqueDigests::add); Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); Set workerSet = backplane.getStorageWorkers(); Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); - return immediateFuture( + Map> digestAndWorkersMap = uniqueDigests.stream() - .filter( // best effort to present digests only missing on active workers + .map( digest -> { Set initialWorkers = foundBlobs.getOrDefault(digest, Collections.emptySet()); - return filterAndAdjustWorkersForDigest( - digest, initialWorkers, workerSet, workersStartTime) - .isEmpty(); + return new AbstractMap.SimpleEntry<>( + digest, + filterAndAdjustWorkersForDigest( + digest, initialWorkers, workerSet, workersStartTime)); }) - .collect(Collectors.toList())); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + ListenableFuture> missingDigestFuture = + immediateFuture( + digestAndWorkersMap.entrySet().stream() + .filter(entry -> entry.getValue().isEmpty()) + .map(Map.Entry::getKey) + .collect(Collectors.toList())); + return transformAsync( + missingDigestFuture, + (missingDigest) -> { + extendLeaseForDigests(digestAndWorkersMap, requestMetadata); + return immediateFuture(missingDigest); + }, + // Propagate context values but don't cascade its cancellation for downstream calls. 
+ Context.current().fork().fixedContextExecutor(directExecutor())); } catch (Exception e) { log.log(Level.SEVERE, "find missing blob via backplane failed", e); return immediateFailedFuture(Status.fromThrowable(e).asException()); @@ -789,6 +807,29 @@ private Set filterAndAdjustWorkersForDigest( return workersStartedBeforeDigestInsertion; } + private void extendLeaseForDigests( + Map> digestAndWorkersMap, RequestMetadata requestMetadata) { + Map> workerAndDigestMap = new HashMap<>(); + digestAndWorkersMap.forEach( + (digest, workers) -> + workers.forEach( + worker -> + workerAndDigestMap.computeIfAbsent(worker, w -> new HashSet<>()).add(digest))); + + workerAndDigestMap.forEach( + (worker, digests) -> workerStub(worker).findMissingBlobs(digests, requestMetadata)); + + try { + backplane.updateDigestsExpiry(digestAndWorkersMap.keySet()); + } catch (IOException e) { + log.log( + Level.WARNING, + format( + "Failed to update expiry duration for digests (%s) insertion time", + digestAndWorkersMap.keySet())); + } + } + private void findMissingBlobsOnWorker( String requestId, Iterable blobDigests, diff --git a/src/test/java/build/buildfarm/instance/shard/BUILD b/src/test/java/build/buildfarm/instance/shard/BUILD index b635c74ba5..2f08b1003d 100644 --- a/src/test/java/build/buildfarm/instance/shard/BUILD +++ b/src/test/java/build/buildfarm/instance/shard/BUILD @@ -219,3 +219,22 @@ java_test( "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", ], ) + +java_test( + name = "JedisCasWorkerMapTest", + size = "small", + srcs = [ + "JedisCasWorkerMapTest.java", + ], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/redis", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@maven//:com_github_fppt_jedis_mock", + "@maven//:com_google_truth_truth", + ], +) diff --git a/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java new file mode 100644 index 0000000000..caa69536c1 --- /dev/null +++ b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java @@ -0,0 +1,63 @@ +package build.buildfarm.instance.shard; + +import static com.google.common.truth.Truth.assertThat; + +import build.bazel.remote.execution.v2.Digest; +import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.redis.RedisClient; +import com.github.fppt.jedismock.RedisServer; +import com.github.fppt.jedismock.server.ServiceOptions; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import redis.clients.jedis.HostAndPort; +import redis.clients.jedis.JedisCluster; + +@RunWith(JUnit4.class) +public class JedisCasWorkerMapTest { + + private static final String CAS_PREFIX = "ContentAddressableStorage"; + + private RedisServer redisServer; + private RedisClient redisClient; + private JedisCasWorkerMap jedisCasWorkerMap; + + @Before + public void setup() throws IOException { + redisServer = + RedisServer.newRedisServer() + .setOptions(ServiceOptions.defaultOptions().withClusterModeEnabled()) + .start(); + redisClient = + new RedisClient( + new JedisCluster( + Collections.singleton( + new 
HostAndPort(redisServer.getHost(), redisServer.getBindPort())))); + jedisCasWorkerMap = new JedisCasWorkerMap(CAS_PREFIX, 60); + } + + @Test + public void testSetExpire() throws IOException { + Digest testDigest1 = Digest.newBuilder().setHash("abc").build(); + Digest testDigest2 = Digest.newBuilder().setHash("xyz").build(); + + String casKey1 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest1); + String casKey2 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest2); + + redisClient.run(jedis -> jedis.sadd(casKey1, "worker1")); + jedisCasWorkerMap.setExpire(redisClient, Arrays.asList(testDigest1, testDigest2)); + + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey1))).isGreaterThan(0L); + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey2))).isEqualTo(-2L); + } + + @After + public void tearDown() throws IOException { + redisServer.stop(); + } +} diff --git a/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java index cd959950a0..d66a600dab 100644 --- a/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java @@ -31,9 +31,13 @@ import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.SECONDS; import static org.mockito.AdditionalAnswers.answer; +import static org.mockito.ArgumentMatchers.anyIterable; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atLeast; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; @@ -80,6 +84,7 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.longrunning.Operation; import com.google.protobuf.Any; @@ -95,6 +100,7 @@ import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -1110,8 +1116,12 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception { buildfarmConfigs.getServer().setFindMissingBlobsViaBackplane(true); Set activeAndImposterWorkers = Sets.newHashSet(Iterables.concat(activeWorkers, imposterWorkers)); + when(mockBackplane.getStorageWorkers()).thenReturn(activeAndImposterWorkers); when(mockBackplane.getBlobDigestsWorkers(any(Iterable.class))).thenReturn(digestAndWorkersMap); + when(mockInstanceLoader.load(anyString())).thenReturn(mockWorkerInstance); + when(mockWorkerInstance.findMissingBlobs(anyIterable(), any(RequestMetadata.class))) + .thenReturn(Futures.immediateFuture(new ArrayList<>())); long serverStartTime = 1686951033L; // june 15th, 2023 Map workersStartTime = new HashMap<>(); @@ -1134,6 +1144,10 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception { Iterables.concat(missingDigests, digestAvailableOnImposters); assertThat(actualMissingDigests).containsExactlyElementsIn(expectedMissingDigests); + verify(mockWorkerInstance, atMost(3)) + .findMissingBlobs(anyIterable(), any(RequestMetadata.class)); + verify(mockWorkerInstance, 
atLeast(1)) + .findMissingBlobs(anyIterable(), any(RequestMetadata.class)); for (Digest digest : actualMissingDigests) { assertThat(digest).isNotIn(availableDigests); From cfa2e18724914034eb19c4c01f293117610fa0b7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:48:35 -0400 Subject: [PATCH 54/58] Bump org.json:json from 20230227 to 20231013 in /admin/main (#1516) Bumps [org.json:json](https://github.com/douglascrockford/JSON-java) from 20230227 to 20231013. - [Release notes](https://github.com/douglascrockford/JSON-java/releases) - [Changelog](https://github.com/stleary/JSON-java/blob/master/docs/RELEASES.md) - [Commits](https://github.com/douglascrockford/JSON-java/commits) --- updated-dependencies: - dependency-name: org.json:json dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- admin/main/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin/main/pom.xml b/admin/main/pom.xml index 006217ccb4..8e128ee64c 100644 --- a/admin/main/pom.xml +++ b/admin/main/pom.xml @@ -94,7 +94,7 @@ org.json json - 20230227 + 20231013 org.projectlombok From 3167fc51e49547042125992bf27bb2bf4973cfc5 Mon Sep 17 00:00:00 2001 From: Jackie Springstead-Chen Date: Mon, 16 Sep 2024 15:50:24 -0400 Subject: [PATCH 55/58] Remove custom Spring exit --- .../java/build/buildfarm/worker/Pipeline.java | 9 +--- .../build/buildfarm/worker/shard/Worker.java | 47 +------------------ .../build/buildfarm/worker/PipelineTest.java | 4 +- 3 files changed, 4 insertions(+), 56 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/Pipeline.java b/src/main/java/build/buildfarm/worker/Pipeline.java index 4198b537e6..aa801d0e92 100644 --- a/src/main/java/build/buildfarm/worker/Pipeline.java +++ b/src/main/java/build/buildfarm/worker/Pipeline.java @@ -44,14 +44,7 @@ public void add(PipelineStage stage, int closePriority) { stageClosePriorities.put(stage, closePriority); } - /** - * Start the pipeline. - * - *
<p>
You can provide callback which is invoked when any stage has an uncaught exception, for - * instance to shutdown the worker gracefully - */ - public void start(SettableFuture uncaughtExceptionFuture) { - stageThreadGroup.setUncaughtExceptionFuture(uncaughtExceptionFuture); + public void start() { for (Thread stageThread : stageThreads.values()) { stageThread.start(); } diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index d740dd7341..e3cc71a4bd 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -94,10 +94,8 @@ import javax.annotation.PreDestroy; import javax.naming.ConfigurationException; import lombok.extern.java.Log; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.ApplicationContext; import org.springframework.context.annotation.ComponentScan; @Log @@ -145,7 +143,6 @@ public class Worker { private Backplane backplane; private LoadingCache workerStubs; - @Autowired private ApplicationContext springContext; /** * The method will prepare the worker for graceful shutdown when the worker is ready. Note on * using stderr here instead of log. By the time this is called in PreDestroy, the log is no @@ -194,43 +191,6 @@ public void prepareWorkerForGracefulShutdown() { } } - private void exitPostPipelineFailure() { - // Shutdown the worker if a pipeline fails. By means of the spring lifecycle - // hooks - e.g. the `PreDestroy` hook here - it will attempt to gracefully - // spin down the pipeline - - // By calling these spring shutdown facilities; we're open to the risk that - // a subsystem may be hanging a criticial thread indeffinitly. Deadline the - // shutdown workflow to ensure we don't leave a zombie worker in this - // situation - ScheduledExecutorService shutdownDeadlineExecutor = newSingleThreadScheduledExecutor(); - - // This may be shorter than the action timeout; assume we have interrupted - // actions in a fatal uncaught exception. 
- int forceShutdownDeadline = 60; - ScheduledFuture termFuture = - shutdownDeadlineExecutor.schedule( - new Runnable() { - public void run() { - log.log( - Level.SEVERE, - String.format( - "Force terminating due to shutdown deadline exceeded (%d seconds)", - forceShutdownDeadline)); - System.exit(1); - } - }, - forceShutdownDeadline, - SECONDS); - - // Consider defining exit codes to better afford out of band instance - // recovery - int code = SpringApplication.exit(springContext, () -> 1); - termFuture.cancel(false); - shutdownDeadlineExecutor.shutdown(); - System.exit(code); - } - private Operation stripOperation(Operation operation) { return instance.stripOperation(operation); } @@ -667,12 +627,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep PrometheusPublisher.startHttpServer(configs.getPrometheusPort()); startFailsafeRegistration(); - // Listen for pipeline unhandled exceptions - ExecutorService pipelineExceptionExecutor = newSingleThreadExecutor(); - SettableFuture pipelineExceptionFuture = SettableFuture.create(); - pipelineExceptionFuture.addListener(this::exitPostPipelineFailure, pipelineExceptionExecutor); - - pipeline.start(pipelineExceptionFuture); + pipeline.start(); healthCheckMetric.labels("start").inc(); executionSlotsTotal.set(configs.getWorker().getExecuteStageWidth()); diff --git a/src/test/java/build/buildfarm/worker/PipelineTest.java b/src/test/java/build/buildfarm/worker/PipelineTest.java index 713f1b0345..f414b8cefe 100644 --- a/src/test/java/build/buildfarm/worker/PipelineTest.java +++ b/src/test/java/build/buildfarm/worker/PipelineTest.java @@ -53,7 +53,7 @@ public void stageThreadReturnCompletesJoin() throws InterruptedException { public void run() {} }, 1); - pipeline.start(null); + pipeline.start(); pipeline.join(); } @@ -68,7 +68,7 @@ public void run() { } }, 1); - pipeline.start(null); + pipeline.start(); pipeline.join(); } } From 7225f817efcf504909e256874eea385831480638 Mon Sep 17 00:00:00 2001 From: Jackie Springstead-Chen Date: Mon, 23 Sep 2024 12:58:53 -0400 Subject: [PATCH 56/58] Pass failing tests --- .../java/build/buildfarm/worker/shard/CFCExecFileSystem.java | 1 + .../buildfarm/instance/shard/RedisShardBackplaneTest.java | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index 9460f505da..264151ef48 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -40,6 +40,7 @@ import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.common.BuildfarmExecutors; +import build.buildfarm.common.DigestUtil; import build.buildfarm.common.io.Directories; import build.buildfarm.common.io.Dirent; import build.buildfarm.worker.ExecDirException; diff --git a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java index 9dd3dfc615..a77e74a3bd 100644 --- a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java +++ b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java @@ -407,8 +407,7 @@ public void testAddWorker() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); when(jedisCluster.hset(anyString(), 
anyString(), anyString())).thenReturn(1L); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("addWorker/test:0000"); backplane.addWorker(shardWorker); verify(jedisCluster, times(1)) From 5a507ce1d31a661b27bfd6f52799ce496723701c Mon Sep 17 00:00:00 2001 From: Jackie Springstead-Chen Date: Thu, 26 Sep 2024 14:55:26 -0400 Subject: [PATCH 57/58] Temporarily make worker public --- src/main/java/build/buildfarm/worker/shard/Worker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index ca08d4c72b..2324f808a2 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -142,7 +142,7 @@ public final class Worker extends LoggingMain { private LoadingCache workerStubs; private AtomicBoolean released = new AtomicBoolean(true); - private Worker() { + public Worker() { super("BuildFarmShardWorker"); } From cd5591ad01b67b053eee75e9a81a5803845c6679 Mon Sep 17 00:00:00 2001 From: Yuriy Belenitsky Date: Wed, 1 Nov 2023 16:14:45 -0400 Subject: [PATCH 58/58] Re-add missing graceful shutdown functionality (#1520) --- .../buildfarm/server/BuildFarmServer.java | 8 ++-- .../build/buildfarm/worker/shard/Worker.java | 47 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index 8672c0df76..c448963209 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -88,20 +88,20 @@ public class BuildFarmServer extends LoggingMain { */ public void prepareServerForGracefulShutdown() { if (configs.getServer().getGracefulShutdownSeconds() == 0) { - System.err.println( + log.info( String.format("Graceful Shutdown is not enabled. Server is shutting down immediately.")); } else { try { - System.err.println( + log.info( String.format( "Graceful Shutdown - Waiting %d to allow connections to drain.", configs.getServer().getGracefulShutdownSeconds())); SECONDS.sleep(configs.getServer().getGracefulShutdownSeconds()); } catch (InterruptedException e) { - System.err.println( + log.info( "Graceful Shutdown - The server graceful shutdown is interrupted: " + e.getMessage()); } finally { - System.err.println( + log.info( String.format( "Graceful Shutdown - It took the server %d seconds to shutdown", configs.getServer().getGracefulShutdownSeconds())); diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 2324f808a2..7f1bee11b8 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -142,6 +142,52 @@ public final class Worker extends LoggingMain { private LoadingCache workerStubs; private AtomicBoolean released = new AtomicBoolean(true); + /** + * The method will prepare the worker for graceful shutdown when the worker is ready. Note on + * using stderr here instead of log. By the time this is called in PreDestroy, the log is no + * longer available and is not logging messages. 
+ */ + public void prepareWorkerForGracefulShutdown() { + if (configs.getWorker().getGracefulShutdownSeconds() == 0) { + log.info( + String.format( + "Graceful Shutdown is not enabled. Worker is shutting down without finishing executions in progress.")); + } else { + inGracefulShutdown = true; + log.info( + "Graceful Shutdown - The current worker will not be registered again and should be shutdown gracefully!"); + pipeline.stopMatchingOperations(); + int scanRate = 30; // check every 30 seconds + int timeWaited = 0; + int timeOut = configs.getWorker().getGracefulShutdownSeconds(); + try { + if (pipeline.isEmpty()) { + log.info("Graceful Shutdown - no work in the pipeline."); + } else { + log.info(String.format("Graceful Shutdown - waiting for executions to finish.")); + } + while (!pipeline.isEmpty() && timeWaited < timeOut) { + SECONDS.sleep(scanRate); + timeWaited += scanRate; + log.info( + String.format( + "Graceful Shutdown - Pipeline is still not empty after %d seconds.", timeWaited)); + } + } catch (InterruptedException e) { + log.info( + "Graceful Shutdown - The worker gracefully shutdown is interrupted: " + e.getMessage()); + } finally { + log.info( + String.format( + "Graceful Shutdown - It took the worker %d seconds to %s", + timeWaited, + pipeline.isEmpty() + ? "finish all actions" + : "gracefully shutdown but still cannot finish all actions")); + } + } + } + public Worker() { super("BuildFarmShardWorker"); } @@ -632,6 +678,7 @@ public synchronized void stop() throws InterruptedException { private void shutdown() throws InterruptedException { log.info("*** shutting down gRPC server since JVM is shutting down"); + prepareWorkerForGracefulShutdown(); PrometheusPublisher.stopHttpServer(); boolean interrupted = Thread.interrupted(); if (pipeline != null) {