Skip to content

Commit

Permalink
Send heartbeat only when needed
Browse files Browse the repository at this point in the history
This slightly changes how the heartbeat works:
Instead of sending it regularly every 120s, only send it if the last update to the server was more than 120s ago.

For example, in a case like this:
```
2024-05-19 11:44:59.219186+00:00 : run: 664937bcb8fa20e74c39f42e task: 126 size: 416 tc: 10+0.1 concurrency: 7 threads: 1 [ sprt : 800000 ]
2024-05-19 11:46:58.900077+00:00 :   1.02 ms (s)   152.93 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:48:55.849837+00:00 :   2.16 ms (s)   144.34 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:48:59.073522+00:00 :   1.55 ms (s)   160.57 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:50:46.055023+00:00 :   2.91 ms (s)   172.30 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:50:59.258581+00:00 :   3.02 ms (s)   149.73 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:52:28.238362+00:00 :   3.37 ms (s)   145.20 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:52:59.421249+00:00 :   1.31 ms (s)   147.74 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:54:23.428656+00:00 :   2.77 ms (s)   155.25 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:54:59.597397+00:00 :   1.61 ms (s)   144.52 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:56:18.618653+00:00 :   3.33 ms (s)   145.78 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:56:59.757185+00:00 :   1.40 ms (s)   143.08 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 11:58:10.802213+00:00 :   1.47 ms (s)   152.79 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 11:58:59.927009+00:00 :   1.24 ms (s)   141.62 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 12:00:01.989951+00:00 :   2.97 ms (s)   144.60 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 12:01:00.095270+00:00 :   1.50 ms (s)   154.71 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 12:01:54.193067+00:00 :   2.85 ms (s)   143.12 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 12:03:00.267084+00:00 :   0.55 ms (s)   151.15 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 12:03:56.383263+00:00 :   2.98 ms (s)   152.39 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 12:05:00.453999+00:00 :   3.06 ms (s)   145.32 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 12:05:48.587522+00:00 :   2.20 ms (s)   142.07 ms (w)  https://tests.stockfishchess.org:443/api/update_task
```
Essentially none of the beats needs to be sent, as the server is getting updates to the task more frequently than once every 2min anyway.

With this patch, we get the following API log on a high-core-count worker:
```
2024-05-19 14:19:13.526943+00:00 : run: 664a027fae57c1758ac5b4ee task: 286 size: 486 tc: 60+0.6 concurrency: 25 threads: 1 [ sprt : 800000 ]
2024-05-19 14:21:13.421935+00:00 :   1.54 ms (s)   169.11 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 14:23:13.680019+00:00 :   1.06 ms (s)   136.66 ms (w)  https://tests.stockfishchess.org:443/api/beat
2024-05-19 14:23:44.292362+00:00 :   3.54 ms (s)   170.59 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 14:23:58.479615+00:00 :   3.40 ms (s)   168.31 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 14:24:29.692896+00:00 :   3.09 ms (s)   177.37 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 14:25:12.928290+00:00 :   1.66 ms (s)   187.89 ms (w)  https://tests.stockfishchess.org:443/api/update_task
2024-05-19 14:27:04.244537+00:00 :   2.64 ms (s)   184.87 ms (w)  https://tests.stockfishchess.org:443/api/update_task
```
i.e. as soon as the games start finishing, the heartbeat is no longer sent.

While the beat API is cheap, it is the most frequent API call, so saving a few calls is a good thing.
  • Loading branch information
vondele committed May 19, 2024
1 parent 62486e4 commit 8e386e7
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
12 changes: 9 additions & 3 deletions worker/games.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,7 @@ def results_to_score(results):


def parse_cutechess_output(
p, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit
p, current_state, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit
):
hash_pattern = re.compile(r"(Base|New)-[a-f0-9]+")

Expand Down Expand Up @@ -1051,6 +1051,8 @@ def shorten_hash(match):
time.sleep(UPDATE_RETRY_TIME)
if not update_succeeded:
raise WorkerException("Too many failed update attempts")
else:
current_state["last_updated"] = datetime.now(timezone.utc)

# Act on line like this:
# Finished game 4 (Base-SHA vs New-SHA): 1/2-1/2 {Draw by adjudication}
Expand All @@ -1065,7 +1067,7 @@ def shorten_hash(match):


def launch_cutechess(
cmd, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit
cmd, current_state, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit
):
if spsa_tuning:
# Request parameters for next game.
Expand Down Expand Up @@ -1144,6 +1146,7 @@ def launch_cutechess(
try:
task_alive = parse_cutechess_output(
p,
current_state,
remote,
result,
spsa_tuning,
Expand Down Expand Up @@ -1178,7 +1181,9 @@ def launch_cutechess(
return task_alive


def run_games(worker_info, password, remote, run, task_id, pgn_file, clear_binaries):
def run_games(
worker_info, current_state, password, remote, run, task_id, pgn_file, clear_binaries
):
# This is the main cutechess-cli driver.
# It is ok, and even expected, for this function to
# raise exceptions, implicitly or explicitly, if a
Expand Down Expand Up @@ -1548,6 +1553,7 @@ def make_player(arg):

task_alive = launch_cutechess(
cmd,
current_state,
remote,
result,
spsa_tuning,
Expand Down
2 changes: 1 addition & 1 deletion worker/sri.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"__version": 237, "updater.py": "Mg+pWOgGA0gSo2TuXuuLCWLzwGwH91rsW1W3ixg3jYauHQpRMtNdGnCfuD1GqOhV", "worker.py": "SZXQEuoQG97IqizDgOrsEuzOl2P5hGbkAqpCaU1tiIp86pz2tq8xevf9T3Ei89zn", "games.py": "KqvcMhLOyArHNTWpD6QAx+IFdZwg8aTRcvGf7eN5kn31V5a+G9Y7137hnrtFGy3K"}
{"__version": 237, "updater.py": "Mg+pWOgGA0gSo2TuXuuLCWLzwGwH91rsW1W3ixg3jYauHQpRMtNdGnCfuD1GqOhV", "worker.py": "Yv0ObjNdzzJStMOg5VEKpm6a2+6nodXZeyJqoiZu0cDZIy8OCtMunyqUmT9z0V/9", "games.py": "6vKH51UtL56oNvA539hLXRzgE1ADXy3QZNJohoK94RntM72+iMancSJZHaNjEb5+"}
26 changes: 18 additions & 8 deletions worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from configparser import ConfigParser
from contextlib import ExitStack
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from functools import partial
from pathlib import Path

Expand Down Expand Up @@ -1171,13 +1171,12 @@ def heartbeat(worker_info, password, remote, current_state):
"password": password,
"worker_info": worker_info,
}
count = 0
while current_state["alive"]:
time.sleep(1)
count += 1
if count == 120:
count = 0
now = datetime.now(timezone.utc)
if current_state["last_updated"] + timedelta(seconds=120) < now:
print(" Send heartbeat for", worker_info["unique_key"], end=" ... ")
current_state["last_updated"] = now
run = current_state["run"]
payload["run_id"] = str(run["_id"]) if run else None
task_id = current_state["task_id"]
Expand Down Expand Up @@ -1419,7 +1418,16 @@ def fetch_and_handle_task(
api = remote + "/api/failed_task"
pgn_file = [None]
try:
run_games(worker_info, password, remote, run, task_id, pgn_file, clear_binaries)
run_games(
worker_info,
current_state,
password,
remote,
run,
task_id,
pgn_file,
clear_binaries,
)
success = True
except FatalException as e:
message = str(e)
Expand Down Expand Up @@ -1528,8 +1536,10 @@ def worker():
current_state = {
"run": None, # the current run
"task_id": None, # the id of the current task
"alive": True, # controls the main loop and
# the heartbeat loop
"alive": True, # controls the main and heartbeat loop
"last_updated": datetime.now(
timezone.utc
), # tracks the last update to the server
}

# Install signal handlers.
Expand Down

0 comments on commit 8e386e7

Please sign in to comment.