@@ -41,7 +41,7 @@ def __init__(self, task, submission_result):
         self.task = task
         self.submission_result = submission_result
         self.dataset = submission_result.dataset if submission_result else None
-        self.skip_enabled = getattr(task, 'skip_failed_subtask', True)
+        self.skip_enabled = getattr(task, "skip_failed_subtask", False)
         self._subtask_groups = None
         self._failed_subtasks = set()
         self._skipped_testcases = set()
@@ -56,16 +56,35 @@ def should_skip_testcase(self, testcase_codename: str) -> bool:
         if not self.skip_enabled or not self.dataset:
             return False
 
-        # Only skip for GroupMin and GroupMul score types
         score_type = self.dataset.score_type
-        if score_type not in ['GroupMin', 'GroupMul']:
+        if score_type not in ["GroupMin", "GroupMul"]:
             return False
 
+        if testcase_codename in self._skipped_testcases:
+            return True
+
+        # Check if any earlier testcase in the same subtask has failed
         subtask_idx = self._get_subtask_for_testcase(testcase_codename)
         if subtask_idx is None:
             return False
 
-        return subtask_idx in self._failed_subtasks
+        # Check if this subtask has already failed due to an earlier testcase
+        if subtask_idx in self._failed_subtasks:
+            subtask_testcases = self._get_testcases_in_subtask(subtask_idx)
+            try:
+                current_testcase_idx = subtask_testcases.index(testcase_codename)
+                # Check if any earlier testcase in this subtask has failed
+                for i in range(current_testcase_idx):
+                    earlier_testcase = subtask_testcases[i]
+                    if self._is_testcase_failed(earlier_testcase):
+                        logger.info(
+                            f"Skipping testcase {testcase_codename} because earlier testcase {earlier_testcase} failed in subtask {subtask_idx}"
+                        )
+                        return True
+            except ValueError:
+                pass
+
+        return False
 
     def mark_testcase_failed(self, testcase_codename: str, outcome: float):
         """Mark a testcase as failed and potentially skip remaining testcases in the subtask.
@@ -76,31 +95,52 @@ def mark_testcase_failed(self, testcase_codename: str, outcome: float):
         if not self.skip_enabled or not self.dataset:
             return
 
-        # Only handle for GroupMin and GroupMul score types
         score_type = self.dataset.score_type
         if score_type not in ['GroupMin', 'GroupMul']:
             return
 
-        # Check if this testcase failed (outcome is 0.0 for failed)
+        # Check if this testcase failed
        if outcome > 0.0:
             return
 
         subtask_idx = self._get_subtask_for_testcase(testcase_codename)
         if subtask_idx is None:
+            logger.warning(f"Could not find subtask for testcase {testcase_codename}")
             return
 
         # Mark this subtask as failed
         self._failed_subtasks.add(subtask_idx)
         logger.info(f"Marking subtask {subtask_idx} as failed due to testcase {testcase_codename}")
 
-        # Get all testcases in this subtask and mark remaining ones as skipped
+        # Get all testcases in this subtask in order
         subtask_testcases = self._get_testcases_in_subtask(subtask_idx)
-        for tc_codename in subtask_testcases:
-            if tc_codename != testcase_codename:  # Skip the failing testcase itself
-                # Check if this testcase hasn't been evaluated yet
-                if not self._is_testcase_evaluated(tc_codename):
-                    self._skipped_testcases.add(tc_codename)
-                    logger.info(f"Marking testcase {tc_codename} as skipped in subtask {subtask_idx}")
+        logger.info(f"Subtask {subtask_idx} testcases in order: {subtask_testcases}")
+
+        # Find the position of the failing testcase
+        try:
+            failing_testcase_idx = subtask_testcases.index(testcase_codename)
+            logger.info(
+                f"Failing testcase {testcase_codename} is at position {failing_testcase_idx} in subtask {subtask_idx}"
+            )
+        except ValueError:
+            logger.warning(
+                f"Failed testcase {testcase_codename} not found in subtask {subtask_idx}"
+            )
+            return
+
+        # Skip only the testcases that come after the failing one in this subtask
+        for i in range(failing_testcase_idx + 1, len(subtask_testcases)):
+            tc_codename = subtask_testcases[i]
+            # Only skip if this testcase hasn't been started yet
+            if not self._is_testcase_started(tc_codename):
+                self._skipped_testcases.add(tc_codename)
+                logger.info(
+                    f"Marking testcase {tc_codename} (position {i}) as skipped in subtask {subtask_idx} (after failure of {testcase_codename})"
+                )
+            else:
+                logger.info(
+                    f"Testcase {tc_codename} (position {i}) already started/completed, not skipping"
+                )
 
     def get_skipped_testcases(self) -> Set[str]:
         """Get the set of testcase codenames that should be skipped."""
@@ -126,22 +166,31 @@ def _get_subtask_groups(self) -> Optional[Dict[int, List[str]]]:
 
         self._subtask_groups = {}
         testcase_names = sorted(self.dataset.testcases.keys())
+        logger.debug(f"All testcase names in order: {testcase_names}")
+        logger.debug(f"Score type parameters: {parameters}")
 
         for subtask_idx, parameter in enumerate(parameters):
             if len(parameter) < 2:
                 continue
 
-            max_score, target = parameter[0], parameter[1]
+            _, target = (
+                parameter[0],
+                parameter[1],
+            )
 
             if isinstance(target, int):
-                # Number-based grouping: first N testcases
                 start_idx = sum(param[1] for param in parameters[:subtask_idx] if isinstance(param[1], int))
                 end_idx = start_idx + target
                 group_testcases = testcase_names[start_idx:end_idx]
+                logger.debug(
+                    f"Subtask {subtask_idx} (number-based): testcases {start_idx}-{end_idx - 1} = {group_testcases}"
+                )
             elif isinstance(target, str):
-                # Regex-based grouping
                 pattern = re.compile(target)
                 group_testcases = [tc for tc in testcase_names if pattern.match(tc)]
+                logger.debug(
+                    f"Subtask {subtask_idx} (regex-based): pattern '{target}' = {group_testcases}"
+                )
             else:
                 continue
 
@@ -198,3 +247,52 @@ def _is_testcase_evaluated(self, testcase_codename: str) -> bool:
                 return True
 
         return False
+
+    def _is_testcase_started(self, testcase_codename: str) -> bool:
+        """Check if a testcase has been started (queued, running, or completed).
+
+        This is more comprehensive than _is_testcase_evaluated as it also
+        checks if the testcase is currently being evaluated.
+
+        testcase_codename: The codename of the testcase
+
+        Returns: True if the testcase has been started, False otherwise
+        """
+        # First check if it's already completed
+        if self._is_testcase_evaluated(testcase_codename):
+            return True
+
+        # For now, we use the same logic as _is_testcase_evaluated.
+        # In the future we could also check whether the testcase is
+        # currently in the evaluation queue or being processed, but since
+        # we don't have easy access to the queue state here, we only skip
+        # testcases that definitely haven't been touched yet.
+
+        # TODO: Could be enhanced to check the evaluation service queue
+        return self._is_testcase_evaluated(testcase_codename)
+
+    def _is_testcase_failed(self, testcase_codename: str) -> bool:
+        """Check if a testcase has failed (outcome <= 0.0).
+
+        testcase_codename: The codename of the testcase
+
+        Returns: True if the testcase failed, False otherwise
+        """
+        if not self.submission_result:
+            return False
+
+        for evaluation in self.submission_result.evaluations:
+            if evaluation.codename == testcase_codename:
+                try:
+                    outcome = (
+                        float(evaluation.outcome)
+                        if evaluation.outcome != "N/A"
+                        and evaluation.outcome is not None
+                        else 0.0
+                    )
+                    return outcome <= 0.0
+                except (ValueError, TypeError):
+                    return True  # If we can't parse the outcome, consider it failed
+
+        return False  # Not evaluated yet, so not failed
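
A minimal, self-contained sketch of the grouping rule these hunks rely on, assuming GroupMin/GroupMul parameters of the form `[max_score, target]`, where an integer target takes the next N testcases in sorted codename order and a string target is a regex over codenames. The concrete `parameters` and codenames below are made up for illustration, not taken from this PR:

```python
import re

# Hypothetical score-type parameters: one [max_score, target] pair per subtask.
parameters = [[30, 2], [70, r"2_.*"]]
testcase_names = sorted(["1_01", "1_02", "2_01", "2_02", "2_03"])

subtask_groups = {}
for subtask_idx, (max_score, target) in enumerate(parameters):
    if isinstance(target, int):
        # Number-based grouping: skip the testcases consumed by earlier
        # integer-targeted subtasks, then take the next `target` names.
        start_idx = sum(p[1] for p in parameters[:subtask_idx]
                        if isinstance(p[1], int))
        subtask_groups[subtask_idx] = testcase_names[start_idx:start_idx + target]
    elif isinstance(target, str):
        # Regex-based grouping: every codename matching the pattern.
        pattern = re.compile(target)
        subtask_groups[subtask_idx] = [tc for tc in testcase_names
                                       if pattern.match(tc)]

print(subtask_groups)
# {0: ['1_01', '1_02'], 1: ['2_01', '2_02', '2_03']}
```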
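And a sketch of the skip decision itself, assuming (as the diff does) that testcases within a subtask are processed in their listed order and that only testcases strictly after the first failure, and not yet started, get skipped. `testcases_to_skip` and its arguments are placeholder names for illustration, not the helpers from this PR:

```python
def testcases_to_skip(subtask_testcases, failed_codename, started):
    """Return the codenames that should be skipped after a failure.

    subtask_testcases: ordered codenames of one GroupMin/GroupMul subtask
    failed_codename:   the testcase that just scored 0.0
    started:           set of codenames already queued or evaluated
    """
    try:
        failing_idx = subtask_testcases.index(failed_codename)
    except ValueError:
        return set()
    # Only testcases after the failing one, and only if not started yet.
    return {tc for tc in subtask_testcases[failing_idx + 1:]
            if tc not in started}

group = ["2_01", "2_02", "2_03", "2_04"]
print(sorted(testcases_to_skip(group, "2_02", started={"2_01", "2_02"})))
# ['2_03', '2_04']
```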