Merge pull request #192 from neulab/add_confidence

neubig · web-flow · commit 958dbb287d2d · 2022-03-30T08:53:32.000-04:00
Add confidence interval calculation to CLI. Former-commit-id: e23c9f9
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/python/black.git
-    rev: 22.1.0
+    rev: 22.3.0 
     hooks:
       - id: black
         files: '\.py$'
diff --git a/explainaboard/explainaboard_main.py b/explainaboard/explainaboard_main.py
@@ -63,6 +63,14 @@ def main():
         help="multiple metrics should be separated by space",
     )
 
+    parser.add_argument(
+        '--conf_value',
+        type=float,
+        required=False,
+        default=0.05,
+        help="the p-value with which to calculate the confidence interval",
+    )
+
     args = parser.parse_args()
 
     dataset = args.dataset
@@ -110,6 +118,7 @@ def main():
         "task_name": task,
         "reload_stat": reload_stat,
         "user_defined_features_configs": loaders[0].user_defined_features_configs,
+        "conf_value": args.conf_value,
     }
     if metric_names is not None:
         metadata["metric_names"] = metric_names
diff --git a/explainaboard/info.py b/explainaboard/info.py
@@ -87,7 +87,7 @@ class SysOutputInfo:
     metric_names: Optional[list[str]] = None
     reload_stat: bool = True
     is_print_case: bool = True
-    is_print_confidence_interval: bool = False
+    conf_value: float = 0.05
     # language : str = "English"
 
     # set later
diff --git a/explainaboard/processors/processor.py b/explainaboard/processors/processor.py
@@ -404,7 +404,7 @@ def get_bucket_performance(
                 bucket_stats = metric_stat.filter(sample_ids)
                 metric_result = metric_func.evaluate_from_stats(
                     bucket_stats,
-                    conf_value=0.05 if sys_info.is_print_confidence_interval else None,
+                    conf_value=sys_info.conf_value,
                 )
 
                 conf_low, conf_high = (
@@ -455,7 +455,7 @@ def get_overall_performance(
         ):
             metric_result = metric_func.evaluate_from_stats(
                 metric_stat,
-                conf_value=0.05 if sys_info.is_print_confidence_interval else None,
+                conf_value=sys_info.conf_value,
             )
 
             conf_low, conf_high = (