Skip to content

Commit f239125

Browse files
authored
[LSC] Avoid relaunching the daemon if the socket is probably bound (#1489)
1 parent 68e7071 commit f239125

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

Diff for: src/job_cache/job_cache.cpp

+24-5
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,9 @@ static bool daemonize(std::string dir) {
134134
return true;
135135
}
136136

137-
wcl::optional<wcl::unique_fd> try_connect(std::string dir) {
137+
// Reasons try_connect can fail to hand back a connected socket.
enum class TryConnectError {
  // Any failure other than the one below, for example the key file could not
  // be opened or connect() failed with an errno other than EAGAIN.
  Generic,
  // connect() reported EAGAIN, which callers treat as evidence that the
  // address is already bound, so no new daemon needs to be launched.
  AddrBound,
};
138+
139+
wcl::result<wcl::unique_fd, TryConnectError> try_connect(std::string dir) {
138140
wcl::unique_fd socket_fd;
139141
{
140142
int local_socket_fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
@@ -151,7 +153,7 @@ wcl::optional<wcl::unique_fd> try_connect(std::string dir) {
151153
auto fd = wcl::unique_fd::open(key_path.c_str(), O_RDONLY);
152154
if (!fd) {
153155
wcl::log::info("open(%s): %s", key_path.c_str(), strerror(fd.error()))();
154-
return {};
156+
return wcl::result_error<wcl::unique_fd>(TryConnectError::Generic);
155157
}
156158

157159
// TODO: We should make this read more robust. It's mostly fine if
@@ -167,14 +169,20 @@ wcl::optional<wcl::unique_fd> try_connect(std::string dir) {
167169
addr.sun_family = AF_UNIX;
168170
addr.sun_path[0] = '\0';
169171

172+
// Among the failures that can occur here, we could get EAGAIN. We can't use
173+
// epoll to avoid this; however, if we get EAGAIN it means that we don't have
174+
// to launch another process because the address is already bound.
170175
wcl::log::info("key = %s, sizeof(key) = %lu", key, sizeof(key))();
171176
memcpy(addr.sun_path + 1, key, sizeof(key));
172177
if (connect(socket_fd.get(), reinterpret_cast<const sockaddr *>(&addr), sizeof(key)) == -1) {
178+
if (errno == EAGAIN) {
179+
return wcl::result_error<wcl::unique_fd>(TryConnectError::AddrBound);
180+
}
173181
wcl::log::info("connect(%s): %s", key, strerror(errno))();
174-
return {};
182+
return wcl::result_error<wcl::unique_fd>(TryConnectError::Generic);
175183
}
176184

177-
return wcl::make_some<wcl::unique_fd>(std::move(socket_fd));
185+
return wcl::result_value<TryConnectError>(std::move(socket_fd));
178186
}
179187

180188
// Launch the job cache daemon
@@ -209,16 +217,27 @@ wcl::result<wcl::unique_fd, ConnectError> Cache::backoff_try_connect(int attempt
209217
wcl::xoshiro_256 rng(wcl::xoshiro_256::get_rng_seed());
210218
useconds_t backoff = 1000;
211219
wcl::unique_fd socket_fd;
220+
bool addr_bound = false;
212221
for (int i = 0; i < attempts; i++) {
213222
// We normally connect in about 3 tries, sometimes 4 on fresh
214223
// connect so if we haven't connected at this point it's a good
215224
// spot to start trying.
216-
if (i > 4) {
225+
// Additionally if on the previous connection attempt we gained evidence
226+
// that the connection is already bound, we'll avoid relaunching for at
227+
// most 1 round. This halves the number of extraneous daemon launches
228+
// when the daemon is under load.
229+
if (i > 4 && !addr_bound) {
217230
launch_daemon();
218231
}
219232

233+
addr_bound = false;
220234
auto fd_opt = try_connect(cache_dir);
221235
if (!fd_opt) {
236+
// If we receive this specific error then we know that a daemon exists,
237+
// it's just busy
238+
if (fd_opt.error() == TryConnectError::AddrBound) {
239+
addr_bound = true;
240+
}
222241
std::uniform_int_distribution<useconds_t> variance(0, backoff);
223242
usleep(backoff + variance(rng));
224243
backoff *= 2;

0 commit comments

Comments
 (0)