Drop az_quiz._winner, use az_quiz._outcome instead.

foxik · foxik · commit bf96d4c7432b · 2025-05-10T21:35:44.000+02:00
diff --git a/labs/npfl139/board_games/az_quiz.py b/labs/npfl139/board_games/az_quiz.py
@@ -18,7 +18,7 @@ def __init__(self, randomized=False):
         self._board = np.tri(self.N, dtype=np.int8) - 1
         self._randomized = randomized
         self._to_play = 0
-        self._winner = None
+        self._outcome = None
         self._screen = None
         self._last_action, self._winning_stones = None, None
 
@@ -27,11 +27,11 @@ def clone(self, swap_players=False) -> "AZQuiz":
         if swap_players:
             clone._board = self._SWAP_PLAYERS[self._board + 1]
             clone._to_play = 1 - self._to_play
-            clone._winner = 1 - self._winner if self._winner is not None else None
+            clone._outcome = self._outcome.reverse() if self._outcome is not None else None
         else:
             clone._board[:, :] = self._board
             clone._to_play = self._to_play
-            clone._winner = self._winner
+            clone._outcome = self._outcome
         clone._last_action, clone._winning_stones = self._last_action, self._winning_stones
         return clone
 
@@ -48,16 +48,14 @@ def to_play(self) -> int:
         return self._to_play
 
     def outcome(self, player: int) -> BoardGame.Outcome | None:
-        if self._winner is None:
-            return None
-        return self.Outcome.WIN if self._winner == player else self.Outcome.LOSS
+        return self._outcome if self._outcome is None or player == self._to_play else self._outcome.reverse()
 
     def valid(self, action: int) -> bool:
-        return self._winner is None and action >= 0 and action < self.ACTIONS \
+        return self._outcome is None and action >= 0 and action < self.ACTIONS \
             and self._board[self._ACTION_Y[action], self._ACTION_X[action]] < 2
 
     def valid_actions(self) -> list[int]:
-        return np.nonzero(self._board[self._ACTION_Y, self._ACTION_X] < 2)[0] if self._winner is None else []
+        return np.nonzero(self._board[self._ACTION_Y, self._ACTION_X] < 2)[0] if self._outcome is None else []
 
     def move(self, action: int):
         self._last_action = action
@@ -99,7 +97,7 @@ def _move(self, action, random_value):
             if field >= 2:
                 self._traverse(j, 0, field, edges, visited)
                 if edges.all():
-                    self._winner = field - 2
+                    self._outcome = self.Outcome.WIN if field - 2 == self._to_play else self.Outcome.LOSS
                     self._winning_stones = visited == 1
                 visited += visited > 0
 
diff --git a/lectures/lecture12.md b/lectures/lecture12.md
@@ -4,6 +4,7 @@
 #### Reading: https://ufal.mff.cuni.cz/~straka/courses/npfl139/2425/slides.pdf/npfl139-2425-12.pdf,PDF Slides
 #### Video: https://lectures.ms.mff.cuni.cz/video/rec/npfl139/2425/npfl139-2425-12.mp4, Lecture
 #### Questions: #lecture_12_questions
+#### Lecture assignment: az_quiz_randomized
 
 - MuZero [[Julian Schrittwieser et al.: Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model](https://arxiv.org/abs/1911.08265)]
 - AlphaZero as regularized policy optimization [[Jean-Bastien Grill et al.: Monte-Carlo Tree Search as Regularized Policy Optimization](https://arxiv.org/abs/2007.12509)]
diff --git a/tasks/az_quiz_randomized.md b/tasks/az_quiz_randomized.md
@@ -0,0 +1,29 @@
+### Assignment: az_quiz_randomized
+#### Date: Deadline: Jun 30, 22:00
+#### Points: 5 points; either this or `pisqorky` is required for automatically passing the exam
+
+Extend the `az_quiz` assignment to handle the possibility of wrong
+answers. Therefore, when choosing a field (an action), you might not
+claim it; in such a case, the state of the field becomes “failed”. When
+a “failed” field is chosen as an action by a player, then either
+- it is successfully claimed by the player (they “answer correctly”); or
+- if the player “answers incorrectly”, the field is claimed by the opposing
+  player; however, in this case, the original player continues playing
+  (i.e., the players do not alternate in this case).
+
+To instantiate this randomized game variant, either pass `randomized=True`
+to the `npfl139.board_games.AZQuiz`, or use `az_quiz_randomized` as the board
+game name (e.g., as the argument to `npfl139.board_games.evaluate` or to
+`npfl139.board_games.BoardGame.from_name`).
+
+Your goal is to propose how to modify the Monte Carlo Tree Search to properly
+handle stochastic MDPs. The information about the distribution of possible next
+states is provided by the `AZQuiz.all_moves` method, which returns a list of
+`(probability, az_quiz_instance)` next states (in our environment, there are
+always two possible next states).
+
+Your implementation must be capable of training and must achieve at least a 90% win
+rate against the simple heuristic. Additionally, part of this assignment is
+to also write to us on Piazza (once you pass in ReCodEx) a description of how
+you handle the stochasticity in MCTS; you will get points only after we finish
+the discussion.