diff --git a/cms/grading/steps/__init__.py b/cms/grading/steps/__init__.py
index dcc17a3486..fc05b7c923 100644
--- a/cms/grading/steps/__init__.py
+++ b/cms/grading/steps/__init__.py
@@ -27,6 +27,8 @@
 from .trusted import checker_step, extract_outcome_and_text, trusted_step
 from .whitediff import _WHITES, _white_diff, white_diff_step,\
     white_diff_fobj_step
+from .realprecision import _EPS, realprecision_diff_step, \
+    realprecision_diff_fobj_step, _real_numbers_compare


 __all__ = [
@@ -43,5 +45,7 @@
     # trusted.py
     "checker_step", "extract_outcome_and_text", "trusted_step",
     # whitediff.py
-    "_WHITES", "_white_diff", "white_diff_step", "white_diff_fobj_step"
+    "_WHITES", "_white_diff", "white_diff_step", "white_diff_fobj_step",
+    # realprecision.py
+    "_EPS", "_real_numbers_compare", "realprecision_diff_step", "realprecision_diff_fobj_step"
 ]
diff --git a/cms/grading/steps/realprecision.py b/cms/grading/steps/realprecision.py
new file mode 100644
index 0000000000..e4feaa5238
--- /dev/null
+++ b/cms/grading/steps/realprecision.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+# Contest Management System - http://cms-dev.github.io/
+# Copyright © 2025 Ron Ryvchin
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""High level functions to perform standardized real-number comparison.
+
+Policy:
+- Tokenization: only fixed-format decimals (no exponent, no inf/nan).
+  Accepted examples: "12", "12.", "12.34", ".5", "-0.0", "+3.000"
+  Rejected examples: "1e-3", "nan", "inf", "0x1.8p3"
+- Tolerance: 1e-6 absolute OR 1e-6 * max(1, |a|, |b|) relative.
+- Numbers are compared pairwise, in order; if the two files do not contain
+  the same count of fixed-format decimals, they are considered different.
+"""
+
+import logging
+import re
+import typing
+
+from cms.grading.Sandbox import Sandbox
+
+from .evaluation import EVALUATION_MESSAGES
+
+
+logger = logging.getLogger(__name__)
+
+
+# Fixed-format decimals only (bytes regex).
+_FIXED_DEC_RE = re.compile(rb'[+-]?(?:\d+(?:\.\d*)?|\.\d+)')
+_EPS = 1e-6
+
+
+def _compare_real_pair(a: float, b: float) -> bool:
+    """Return True if a and b match within absolute/relative tolerance."""
+    diff = abs(a - b)
+    tol = _EPS * max(1.0, abs(a), abs(b))
+    return diff <= tol
+
+
+def _parse_fixed(token: bytes) -> float | None:
+    """Parse a fixed-format decimal token into float; return None on failure."""
+    # The regex already excludes exponents/inf/nan; this is defensive.
+    try:
+        # Decode strictly ASCII; reject weird Unicode digits.
+        s = token.decode("ascii", errors="strict")
+        # float() accepts exponents, but the regex guarantees none are present.
+        return float(s)
+    except Exception:
+        return None
+
+
+def _extract_fixed_decimals(stream: typing.BinaryIO) -> list[float]:
+    """Extract and parse all fixed-format decimal tokens from a binary stream."""
+    data = stream.read()
+    nums: list[float] = []
+    for m in _FIXED_DEC_RE.findall(data):
+        v = _parse_fixed(m)
+        if v is not None:
+            nums.append(v)
+    return nums
+
+
+def _real_numbers_compare(
+    output: typing.BinaryIO, correct: typing.BinaryIO
+) -> bool:
+    """Compare the two output files. The files are considered equal if they
+    contain the same number of real numbers and, for every index i, the
+    absolute or relative difference between real number i of the first file
+    and real number i of the second file is smaller than or equal to 10^-6.
+
+    output: the first file to compare.
+    correct: the second file to compare.
+
+    return: True if the two files are equal up to the 10^-6 tolerance
+        explained above.
+
+    """
+    exp_nums = _extract_fixed_decimals(correct)
+    act_nums = _extract_fixed_decimals(output)
+
+    if len(exp_nums) != len(act_nums):
+        return False
+
+    # Pairwise comparison, in order.
+    for a, b in zip(exp_nums, act_nums):
+        if not _compare_real_pair(a, b):
+            return False
+
+    return True
+
+
+def realprecision_diff_fobj_step(
+    output_fobj: typing.BinaryIO, correct_output_fobj: typing.BinaryIO
+) -> tuple[float, list[str]]:
+    """Compare user output and correct output by extracting the fixed-format
+    floating point numbers and comparing their values.
+
+    It gives an outcome 1.0 if every number in the output is within an
+    absolute or relative difference of 10^-6 of the corresponding number
+    in the reference output, and 0.0 otherwise. This function assumes that
+    the output file exists.
+
+    output_fobj: file for the user output, opened in binary mode.
+    correct_output_fobj: file for the correct output, opened in
+        binary mode.
+
+    return: the outcome as above and a description text.
+
+    """
+    if _real_numbers_compare(output_fobj, correct_output_fobj):
+        return 1.0, [EVALUATION_MESSAGES.get("success").message]
+    else:
+        return 0.0, [EVALUATION_MESSAGES.get("wrong").message]
+
+
+def realprecision_diff_step(
+    sandbox: Sandbox, output_filename: str, correct_output_filename: str
+) -> tuple[float, list[str]]:
+    """Compare user output and correct output by extracting the fixed-format
+    floating point numbers and comparing their values.
+
+    It gives an outcome 1.0 if every number in the output is within an
+    absolute or relative difference of 10^-6 of the corresponding number
+    in the reference output, and 0.0 otherwise (or if the output doesn't
+    exist).
+
+    sandbox: the sandbox we consider.
+    output_filename: the filename of user's output in the sandbox.
+    correct_output_filename: the same with reference output.
+
+    return: the outcome as above and a description text.
+
+    """
+    if sandbox.file_exists(output_filename):
+        with sandbox.get_file(output_filename) as out_file, \
+                sandbox.get_file(correct_output_filename) as res_file:
+            return realprecision_diff_fobj_step(out_file, res_file)
+    else:
+        return 0.0, [
+            EVALUATION_MESSAGES.get("nooutput").message, output_filename]
diff --git a/cms/grading/tasktypes/Batch.py b/cms/grading/tasktypes/Batch.py
index d23ab04e51..6ba086f723 100644
--- a/cms/grading/tasktypes/Batch.py
+++ b/cms/grading/tasktypes/Batch.py
@@ -84,6 +84,7 @@ class Batch(TaskType):
     # Constants used in the parameter definition.
     OUTPUT_EVAL_DIFF = "diff"
     OUTPUT_EVAL_CHECKER = "comparator"
+    OUTPUT_EVAL_REALPREC = "realprecision"

     COMPILATION_ALONE = "alone"
     COMPILATION_GRADER = "grader"
@@ -111,7 +112,8 @@ class Batch(TaskType):
         "output_eval",
         "",
         {OUTPUT_EVAL_DIFF: "Outputs compared with white diff",
-         OUTPUT_EVAL_CHECKER: "Outputs are compared by a comparator"})
+         OUTPUT_EVAL_CHECKER: "Outputs are compared by a comparator",
+         OUTPUT_EVAL_REALPREC: "Outputs compared as real numbers (with precision of 1e-6)"})

     ACCEPTED_PARAMETERS = [_COMPILATION, _USE_FILE, _EVALUATION]
@@ -181,6 +183,9 @@ def _uses_grader(self) -> bool:
     def _uses_checker(self) -> bool:
         return self.output_eval == self.OUTPUT_EVAL_CHECKER

+    def _uses_realprecision(self) -> bool:
+        return self.output_eval == self.OUTPUT_EVAL_REALPREC
+
     @staticmethod
     def _executable_filename(codenames: Iterable[str], language: Language) -> str:
         """Return the chosen executable name computed from the codenames.
@@ -371,6 +376,7 @@ def _evaluate_step(self, job, file_cacher, output_file_params, outcome, text, st
             file_cacher, job,
             self.CHECKER_CODENAME if self._uses_checker() else None,
+            use_realprecision=self._uses_realprecision(),
             **output_file_params,
             extra_args=extra_args)
         # Fill in the job with the results.
diff --git a/cms/grading/tasktypes/OutputOnly.py b/cms/grading/tasktypes/OutputOnly.py
index 7a5e2e3e00..4d80fe9023 100644
--- a/cms/grading/tasktypes/OutputOnly.py
+++ b/cms/grading/tasktypes/OutputOnly.py
@@ -57,6 +57,7 @@ class OutputOnly(TaskType):
     # Constants used in the parameter definition.
     OUTPUT_EVAL_DIFF = "diff"
     OUTPUT_EVAL_CHECKER = "comparator"
+    OUTPUT_EVAL_REALPREC = "realprecision"

     # Other constants to specify the task type behaviour and parameters.
     ALLOW_PARTIAL_SUBMISSION = True
@@ -66,7 +67,8 @@
         "output_eval",
         "",
         {OUTPUT_EVAL_DIFF: "Outputs compared with white diff",
-         OUTPUT_EVAL_CHECKER: "Outputs are compared by a comparator"})
+         OUTPUT_EVAL_CHECKER: "Outputs are compared by a comparator",
+         OUTPUT_EVAL_REALPREC: "Outputs compared as real numbers (with precision of 1e-6)"})

     ACCEPTED_PARAMETERS = [_EVALUATION]
@@ -97,6 +99,9 @@ def get_auto_managers(self):
     def _uses_checker(self) -> bool:
         return self.output_eval == OutputOnly.OUTPUT_EVAL_CHECKER

+    def _uses_realprecision(self) -> bool:
+        return self.output_eval == self.OUTPUT_EVAL_REALPREC
+
     @staticmethod
     def _get_user_output_filename(job: Job):
         return OutputOnly.USER_OUTPUT_FILENAME_TEMPLATE % \
@@ -127,6 +132,7 @@ def evaluate(self, job, file_cacher):
         box_success, outcome, text = eval_output(
             file_cacher, job,
             OutputOnly.CHECKER_CODENAME if self._uses_checker() else None,
+            use_realprecision=self._uses_realprecision(),
             user_output_digest=job.files[user_output_filename].digest)

         # Fill in the job with the results.
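The "realprecision" option above selects the combined absolute/relative tolerance implemented by _compare_real_pair in realprecision.py. A minimal standalone sketch of that policy (EPS mirrors _EPS; the asserted values are illustrative, not fixtures from the patch):

    EPS = 1e-6  # mirrors _EPS in realprecision.py

    def matches(a: float, b: float) -> bool:
        # Absolute tolerance for magnitudes up to 1, relative above that:
        # |a - b| <= EPS * max(1, |a|, |b|).
        return abs(a - b) <= EPS * max(1.0, abs(a), abs(b))

    assert matches(0.0, 9e-7)                 # within absolute tolerance
    assert not matches(0.0, 2e-6)             # outside absolute tolerance
    assert matches(1000000.0, 1000000.9)      # within relative tolerance
    assert not matches(1000000.0, 1000002.0)  # outside relative tolerance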
diff --git a/cms/grading/tasktypes/util.py b/cms/grading/tasktypes/util.py
index 609d7c5ac6..fedd19c647 100644
--- a/cms/grading/tasktypes/util.py
+++ b/cms/grading/tasktypes/util.py
@@ -40,7 +40,7 @@
 from cms.grading.Sandbox import Sandbox
 from cms.grading.language import Language
 from cms.grading.steps import EVALUATION_MESSAGES, checker_step, \
-    white_diff_fobj_step
+    white_diff_fobj_step, realprecision_diff_fobj_step

 logger = logging.getLogger(__name__)
@@ -217,6 +217,7 @@ def eval_output(
     file_cacher: FileCacher,
     job: Job,
     checker_codename: str | None,
+    use_realprecision: bool = False,
     user_output_path: str | None = None,
     user_output_digest: str | None = None,
     user_output_filename: str = "",
@@ -227,7 +228,8 @@
     file_cacher: file cacher to use to get files.
     job: the job triggering this checker run.
     checker_codename: codename of the checker amongst the manager,
-        or None to use white diff.
+        or None to use white diff or the real-number precision comparator.
+    use_realprecision: whether to use the real-number precision comparator
+        instead of white diff (ignored when a checker is used).
     user_output_path: full path of the user output file, None if
         using the digest (exactly one must be non-None).
     user_output_digest: digest of the user output file, None if
@@ -283,12 +285,13 @@
         return success, outcome, text

     else:
+        comparator_function = (realprecision_diff_fobj_step
+                               if use_realprecision else white_diff_fobj_step)
         if user_output_path is not None:
             user_output_fobj = open(user_output_path, "rb")
         else:
             user_output_fobj = file_cacher.get_file(user_output_digest)
         with user_output_fobj:
             with file_cacher.get_file(job.output) as correct_output_fobj:
-                outcome, text = white_diff_fobj_step(
+                outcome, text = comparator_function(
                     user_output_fobj, correct_output_fobj)
                 return True, outcome, text
diff --git a/cmstestsuite/unit_tests/grading/steps/realprecision_test.py b/cmstestsuite/unit_tests/grading/steps/realprecision_test.py
new file mode 100644
index 0000000000..70cae3cea2
--- /dev/null
+++ b/cmstestsuite/unit_tests/grading/steps/realprecision_test.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+# Contest Management System - http://cms-dev.github.io/
+# Copyright © 2025 Ron Ryvchin
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Tests for realprecision.py."""
+
+import unittest
+from io import BytesIO
+
+from cms.grading.steps import _EPS, _real_numbers_compare
+
+_PREC = 12
+
+
+def f(x: float) -> str:
+    return f"{x:.{_PREC}f}"
+
+
+_ACC = 1e-10
+_NOISE = _EPS - _ACC
+_DIFF = _EPS + _ACC
+
+
+class TestRealPrecision(unittest.TestCase):
+
+    @staticmethod
+    def _cmp(s1, s2):
+        return _real_numbers_compare(
+            BytesIO(s1.encode("utf-8")), BytesIO(s2.encode("utf-8")))
+
+    # --- Tokenization ----------------------------------------------------------------
+
+    def test_no_numbers_equal(self):
+        self.assertTrue(self._cmp("", ""))
+        self.assertTrue(self._cmp("Daniel W", "Ron R"))
+        self.assertTrue(self._cmp("你好", "谢谢"))
+
+    def test_no_diff_one_token_and_whites(self):
+        self.assertTrue(self._cmp("1.0 ", "1.0"))
+        self.assertTrue(self._cmp(" 1.0", "1.0"))
+        self.assertTrue(self._cmp(" The answer is 1.0 thanks", "It should be 1.0 ok?"))
+        self.assertFalse(self._cmp(" The answer is 1.0 thanks", "It should be 1.5 ok?"))
+
+    def test_no_diff_multiple_tokens_and_whites(self):
+        self.assertTrue(self._cmp("1\n2\n3", "1 2 3"))
+        self.assertTrue(self._cmp(" \t 1 \r\n 2 \f 3 \v ", "1 2 3"))
+
+    def test_accepted_formats(self):
+        self.assertTrue(self._cmp(".5", "0.5"))
+        self.assertTrue(self._cmp("+3.000", "3"))
+        self.assertTrue(self._cmp("12.", "12"))
+        self.assertTrue(self._cmp("-0.0", "0"))
+
+    def test_multiple_numbers_basic(self):
+        self.assertTrue(self._cmp("1 2.0 3", "1.000 2 3."))
+        self.assertFalse(self._cmp("1 2 3", "1 3 2"))
+        self.assertFalse(self._cmp("1 2.0 3", "1.000 2 3. 4"))
+
+    # --- Absolute accuracy -----------------------------------------------------------
+
+    def test_absolute_tolerance_pass(self):
+        self.assertTrue(self._cmp(f(_NOISE), "0"))
+        self.assertTrue(self._cmp("0", f(-_NOISE)))
+        a = 0.5
+        self.assertTrue(self._cmp(f(a), f(a + _NOISE)))
+        self.assertTrue(self._cmp(f(a - _NOISE), f(a)))
+
+    def test_absolute_tolerance_fail(self):
+        self.assertFalse(self._cmp(f(_DIFF), "0"))
+        self.assertFalse(self._cmp("0", f(-_DIFF)))
+        a = 0.5
+        self.assertFalse(self._cmp(f(a), f(a + _DIFF)))
+        self.assertFalse(self._cmp(f(a - _DIFF), f(a)))
+
+    # --- Relative accuracy -----------------------------------------------------------
+
+    def test_relative_tolerance_pass(self):
+        a = 1
+        b = a + _NOISE * a
+        self.assertTrue(self._cmp(f(a), f(b)))
+        a = 1000000
+        b = a + _NOISE * a
+        self.assertTrue(self._cmp(f(a), f(b)))
+
+    def test_relative_tolerance_fail(self):
+        a = 1
+        b = a + _DIFF * a
+        self.assertFalse(self._cmp(f(a), f(b)))
+        a = 1000000
+        b = a + _DIFF * a
+        self.assertFalse(self._cmp(f(a), f(b)))
+
+    # --- Multiple numbers ------------------------------------------------------------
+
+    def test_multiple_numbers_tolerance(self):
+        A = [0.25, 1.0, 2500000.0, -0.75, -3.0, 0.0, 12.5, 0.5]
+        B = []
+        for i, a in enumerate(A):
+            B.append(a + (1 - 2 * (i % 2)) * _NOISE * max(1.0, abs(a)))
+
+        self.assertTrue(self._cmp(" ".join(map(f, A)), " ".join(map(f, B))))
+        C = B.copy()
+        C[0] = A[0] + _DIFF * max(1.0, abs(A[0]))
+        self.assertFalse(self._cmp(" ".join(map(f, A)), " ".join(map(f, C))))
+        D = B.copy()
+        D[4] = A[4] - _DIFF * max(1.0, abs(A[4]))
+        self.assertFalse(self._cmp(" ".join(map(f, A)), " ".join(map(f, D))))
+        E = B.copy()
+        E[7] = A[7] - _DIFF * max(1.0, abs(A[7]))
+        self.assertFalse(self._cmp(" ".join(map(f, A)), " ".join(map(f, E))))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/cmstestsuite/unit_tests/grading/tasktypes/BatchTest.py b/cmstestsuite/unit_tests/grading/tasktypes/BatchTest.py index
f56935111a..1302864e36 100755 --- a/cmstestsuite/unit_tests/grading/tasktypes/BatchTest.py +++ b/cmstestsuite/unit_tests/grading/tasktypes/BatchTest.py @@ -334,6 +334,75 @@ def assertResultsInJob(self, job): self.assertEqual(job.text, text) self.assertEqual(job.plus, stats) + def test_stdio_realprecision_success(self): + tt, job = self.prepare(["alone", ["", ""], "realprecision"], {"foo": EXE_FOO}) + sandbox = self.expect_sandbox() + + tt.evaluate(job, self.file_cacher) + + # Sandbox created with the correct file cacher and name. + self.Sandbox.assert_called_once_with(self.file_cacher, name="evaluate") + # We need input (with the default filename for redirection) and + # executable copied in the sandbox. + sandbox.create_file_from_storage.assert_has_calls([ + call("foo", "digest of foo", executable=True), + call("input.txt", "digest of input"), + ], any_order=True) + self.assertEqual(sandbox.create_file_from_storage.call_count, 2) + # Evaluation step called with the right arguments, in particular + # redirects, and no (other) writable files. + self.evaluation_step.assert_called_once_with( + sandbox, + fake_evaluation_commands(EVALUATION_COMMAND_1, "foo", "foo"), + 2.5, 123 * 1024 * 1024, + writable_files=[], + stdin_redirect="input.txt", + stdout_redirect="output.txt", + multiprocess=True) + # Check eval_output was called correctly. + self.eval_output.assert_called_once_with( + self.file_cacher, job, None, + use_realprecision=True, user_output_path="/path/0/output.txt", + user_output_filename="", extra_args=None) + # Results put in job and sandbox deleted. + self.assertResultsInJob(job) + sandbox.cleanup.assert_called_once_with(delete=True) + + def test_fileio_realprecision_success(self): + tt, job = self.prepare(["alone", ["myin", "myout"], "realprecision"], + {"foo": EXE_FOO}) + sandbox = self.expect_sandbox() + + tt.evaluate(job, self.file_cacher) + + # Sandbox created with the correct file cacher and name. + self.Sandbox.assert_called_once_with(self.file_cacher, name="evaluate") + # We need input (with the filename specified in the parameters) and + # executable copied in the sandbox. + sandbox.create_file_from_storage.assert_has_calls([ + call("foo", "digest of foo", executable=True), + call("myin", "digest of input"), + ], any_order=True) + self.assertEqual(sandbox.create_file_from_storage.call_count, 2) + # Evaluation step called with the right arguments, in particular + # the specified output is writable. + self.evaluation_step.assert_called_once_with( + sandbox, + fake_evaluation_commands(EVALUATION_COMMAND_1, "foo", "foo"), + 2.5, 123 * 1024 * 1024, + writable_files=["myout"], + stdin_redirect=None, + stdout_redirect=None, + multiprocess=True) + # Check eval_output was called correctly. + self.eval_output.assert_called_once_with( + self.file_cacher, job, None, use_realprecision=True, + user_output_path="/path/0/myout", + user_output_filename="myout", extra_args=None) + # Results put in job and sandbox deleted. + self.assertResultsInJob(job) + sandbox.cleanup.assert_called_once_with(delete=True) + def test_stdio_diff_success(self): tt, job = self.prepare(["alone", ["", ""], "diff"], {"foo": EXE_FOO}) sandbox = self.expect_sandbox() @@ -362,7 +431,8 @@ def test_stdio_diff_success(self): # Check eval_output was called correctly. 
self.eval_output.assert_called_once_with( self.file_cacher, job, None, - user_output_path="/path/0/output.txt", user_output_filename="", extra_args=None) + use_realprecision=False, user_output_path="/path/0/output.txt", + user_output_filename="", extra_args=None) # Results put in job and sandbox deleted. self.assertResultsInJob(job) sandbox.cleanup.assert_called_once_with(delete=True) @@ -482,7 +552,8 @@ def test_fileio_diff_success(self): multiprocess=True) # Check eval_output was called correctly. self.eval_output.assert_called_once_with( - self.file_cacher, job, None, user_output_path="/path/0/myout", + self.file_cacher, job, None, use_realprecision=False, + user_output_path="/path/0/myout", user_output_filename="myout", extra_args=None) # Results put in job and sandbox deleted. self.assertResultsInJob(job) @@ -498,7 +569,8 @@ def test_stdio_checker_success(self): # We only perform checks for the final eval step (checker). self.eval_output.assert_called_once_with( self.file_cacher, job, "checker", - user_output_path="/path/0/output.txt", user_output_filename="", extra_args=None) + use_realprecision=False, user_output_path="/path/0/output.txt", + user_output_filename="", extra_args=None) # Results put in job and sandbox deleted. self.assertResultsInJob(job) sandbox.cleanup.assert_called_once_with(delete=True) @@ -513,6 +585,7 @@ def test_fileio_checker_success(self): # We only perform checks for the final eval step (checker). self.eval_output.assert_called_once_with( self.file_cacher, job, "checker", + use_realprecision=False, user_output_path="/path/0/myout", user_output_filename="myout", extra_args=None) # Results put in job and sandbox deleted. diff --git a/cmstestsuite/unit_tests/grading/tasktypes/OutputOnlyTest.py b/cmstestsuite/unit_tests/grading/tasktypes/OutputOnlyTest.py index 45172cce16..56269ab41e 100755 --- a/cmstestsuite/unit_tests/grading/tasktypes/OutputOnlyTest.py +++ b/cmstestsuite/unit_tests/grading/tasktypes/OutputOnlyTest.py @@ -67,6 +67,19 @@ def assertResultsInJob(self, job, success, outcome, text, stats): self.assertEqual(job.text, text) self.assertEqual(job.plus, stats) + def test_realprecision_success(self): + tt, job = self.prepare(["realprecision"], { + "output_001.txt": FILE_001, + "output_023.txt": FILE_023 + }) + + tt.evaluate(job, self.file_cacher) + + self.eval_output.assert_called_once_with( + self.file_cacher, job, None, use_realprecision=True, + user_output_digest="digest of 023") + self.assertResultsInJob(job, True, str(OUTCOME), TEXT, {}) + def test_diff_success(self): tt, job = self.prepare(["diff"], { "output_001.txt": FILE_001, @@ -76,7 +89,8 @@ def test_diff_success(self): tt.evaluate(job, self.file_cacher) self.eval_output.assert_called_once_with( - self.file_cacher, job, None, user_output_digest="digest of 023") + self.file_cacher, job, None, use_realprecision=False, + user_output_digest="digest of 023") self.assertResultsInJob(job, True, str(OUTCOME), TEXT, {}) def test_diff_missing_file(self): @@ -100,7 +114,8 @@ def test_diff_failure(self): tt.evaluate(job, self.file_cacher) self.eval_output.assert_called_once_with( - self.file_cacher, job, None, user_output_digest="digest of 023") + self.file_cacher, job, None, use_realprecision=False, + user_output_digest="digest of 023") self.assertResultsInJob(job, False, None, None, None) def test_comparator_success(self): @@ -113,6 +128,7 @@ def test_comparator_success(self): self.eval_output.assert_called_once_with( self.file_cacher, job, "checker", + use_realprecision=False, user_output_digest="digest 
of 023") self.assertResultsInJob(job, True, str(OUTCOME), TEXT, {})