@@ -134,7 +134,9 @@ static bool daemonize(std::string dir) {
134
134
return true ;
135
135
}
136
136
137
- wcl::optional<wcl::unique_fd> try_connect (std::string dir) {
137
+ enum class TryConnectError { Generic, AddrBound };
138
+
139
+ wcl::result<wcl::unique_fd, TryConnectError> try_connect (std::string dir) {
138
140
wcl::unique_fd socket_fd;
139
141
{
140
142
int local_socket_fd = socket (AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0 );
@@ -151,7 +153,7 @@ wcl::optional<wcl::unique_fd> try_connect(std::string dir) {
151
153
auto fd = wcl::unique_fd::open (key_path.c_str (), O_RDONLY);
152
154
if (!fd) {
153
155
wcl::log::info (" open(%s): %s" , key_path.c_str (), strerror (fd.error ()))();
154
- return {} ;
156
+ return wcl::result_error<wcl::unique_fd>(TryConnectError::Generic) ;
155
157
}
156
158
157
159
// TODO: We should make this read more robust. It's mostly fine if
@@ -167,14 +169,20 @@ wcl::optional<wcl::unique_fd> try_connect(std::string dir) {
167
169
addr.sun_family = AF_UNIX;
168
170
addr.sun_path [0 ] = ' \0 ' ;
169
171
172
+ // Among the failures that can occur here, we could get EAGAIN. We can't use
173
+ // epoll to avoid this; however, if we get EAGAIN it means that we don't have
174
+ // to launch another process because the address is already bound.
170
175
wcl::log::info (" key = %s, sizeof(key) = %lu" , key, sizeof (key))();
171
176
memcpy (addr.sun_path + 1 , key, sizeof (key));
172
177
if (connect (socket_fd.get (), reinterpret_cast <const sockaddr *>(&addr), sizeof (key)) == -1 ) {
178
+ if (errno == EAGAIN) {
179
+ return wcl::result_error<wcl::unique_fd>(TryConnectError::AddrBound);
180
+ }
173
181
wcl::log::info (" connect(%s): %s" , key, strerror (errno))();
174
- return {} ;
182
+ return wcl::result_error<wcl::unique_fd>(TryConnectError::Generic) ;
175
183
}
176
184
177
- return wcl::make_some<wcl::unique_fd >(std::move (socket_fd));
185
+ return wcl::result_value<TryConnectError >(std::move (socket_fd));
178
186
}
179
187
180
188
// Launch the job cache daemon
@@ -209,16 +217,27 @@ wcl::result<wcl::unique_fd, ConnectError> Cache::backoff_try_connect(int attempt
209
217
wcl::xoshiro_256 rng (wcl::xoshiro_256::get_rng_seed ());
210
218
useconds_t backoff = 1000 ;
211
219
wcl::unique_fd socket_fd;
220
+ bool addr_bound = false ;
212
221
for (int i = 0 ; i < attempts; i++) {
213
222
// We normally connect in about 3 tries, sometimes 4 on fresh
214
223
// connect so if we haven't connected at this point it's a good
215
224
// spot to start trying.
216
- if (i > 4 ) {
225
+ // Additionally if on the previous connection attempt we gained evidence
226
+ // that the connection is already bound, we'll avoid relaunching for at
227
+ // most 1 round. This halves the number of extraneous daemon launches
228
+ // when the daemon is under load.
229
+ if (i > 4 && !addr_bound) {
217
230
launch_daemon ();
218
231
}
219
232
233
+ addr_bound = false ;
220
234
auto fd_opt = try_connect (cache_dir);
221
235
if (!fd_opt) {
236
+ // If we receive this specific error then we know that a daemon exists,
237
+ // it's just busy
238
+ if (fd_opt.error () == TryConnectError::AddrBound) {
239
+ addr_bound = true ;
240
+ }
222
241
std::uniform_int_distribution<useconds_t > variance (0 , backoff);
223
242
usleep (backoff + variance (rng));
224
243
backoff *= 2 ;
0 commit comments