Refactor ELO rating update logic in GhostTrainer #6138

Status: Open. Wants to merge 1 commit into develop.
ml-agents/mlagents/trainers/ghost/trainer.py: 49 changes (29 additions, 20 deletions)
@@ -195,26 +195,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         i.e. in asymmetric games. We assume the last reward determines the winner.
         :param trajectory: Trajectory.
         """
-        if (
-            trajectory.done_reached
-            and trajectory.all_group_dones_reached
-            and not trajectory.interrupted
-        ):
-            # Assumption is that final reward is >0/0/<0 for win/draw/loss
-            final_reward = (
-                trajectory.steps[-1].reward + trajectory.steps[-1].group_reward
-            )
-            result = 0.5
-            if final_reward > 0:
-                result = 1.0
-            elif final_reward < 0:
-                result = 0.0
-
-            change = self.controller.compute_elo_rating_changes(
-                self.current_elo, result
-            )
-            self.change_current_elo(change)
-            self._stats_reporter.add_stat("Self-play/ELO", self.current_elo)
+        self.update_elo_ratings(trajectory)

     def advance(self) -> None:
         """
@@ -478,3 +459,31 @@ def subscribe_trajectory_queue(
             parsed_behavior_id.brain_name
         ] = internal_trajectory_queue
         self.trainer.subscribe_trajectory_queue(internal_trajectory_queue)
+
+    def update_elo_ratings(self, trajectory: Trajectory) -> None:
+        """
+        Updates the ELO ratings based on the outcome of an episode.
+        This method encapsulates the ELO update logic that was previously
+        part of the _process_trajectory method.
+
+        :param trajectory: Trajectory containing the episode outcome.
+        """
+        if (
+            trajectory.done_reached
+            and trajectory.all_group_dones_reached
+            and not trajectory.interrupted
+        ):
+            final_reward = (
+                trajectory.steps[-1].reward + trajectory.steps[-1].group_reward
+            )
+            result = 0.5
+            if final_reward > 0:
+                result = 1.0
+            elif final_reward < 0:
+                result = 0.0
+
+            change = self.controller.compute_elo_rating_changes(
+                self.current_elo, result
+            )
+            self.change_current_elo(change)
+            self._stats_reporter.add_stat("Self-play/ELO", self.current_elo)
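
For readers unfamiliar with the rating math delegated to the controller: compute_elo_rating_changes maps the episode result (1.0 win, 0.5 draw, 0.0 loss) to a rating delta. Below is a minimal sketch of the conventional Elo update, assuming a fixed K-factor; the helper name compute_elo_delta and the constants are illustrative, not the actual GhostController implementation.

def compute_elo_delta(
    rating: float, opponent_rating: float, result: float, k: float = 16.0
) -> float:
    # Expected score of this agent against the opponent on the logistic Elo curve.
    expected = 1.0 / (1.0 + 10.0 ** ((opponent_rating - rating) / 400.0))
    # Positive when the agent over-performs the expectation, negative otherwise.
    return k * (result - expected)

# Example: two equally rated agents, and the current agent wins.
# expected = 0.5, so the delta is 16.0 * (1.0 - 0.5) = 8.0.
print(compute_elo_delta(1200.0, 1200.0, 1.0))  # 8.0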
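A practical benefit of extracting update_elo_ratings is that the win/draw/loss mapping can now be exercised in isolation, without driving a full trajectory through _process_trajectory. Here is a hypothetical pytest-style sketch using unittest.mock stubs; the test name and stub shapes are assumptions for illustration, not part of this PR.

from unittest import mock

from mlagents.trainers.ghost.trainer import GhostTrainer

def test_update_elo_ratings_scores_positive_final_reward_as_win():
    # Stand-in trainer: update_elo_ratings only touches these attributes.
    trainer = mock.Mock()
    trainer.current_elo = 1200.0
    # Minimal trajectory stub: episode ended cleanly with a positive final reward.
    step = mock.Mock(reward=1.0, group_reward=0.0)
    trajectory = mock.Mock(
        done_reached=True,
        all_group_dones_reached=True,
        interrupted=False,
        steps=[step],
    )
    # Call the unbound method with the stub standing in for self.
    GhostTrainer.update_elo_ratings(trainer, trajectory)
    # A positive final reward must be scored as a win (result == 1.0).
    trainer.controller.compute_elo_rating_changes.assert_called_once_with(1200.0, 1.0)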