Skip to content

Commit 0b6946c

Browse files
esantorella authored and facebook-github-bot committed
Remove _get_best_row_for_scalarized_objective and _get_best_row_for_single_objective (#3751)
Summary: Pull Request resolved: #3751 * Remove `_get_best_row_for_scalarized_objective` and `_get_best_row_for_single_objective`, enabled by cutting `get_best_raw_objective_point_with_trial_index` over to use `get_trace_by_arm_pull_from_data` * Add more careful no-data handling for `get_trace_by_arm_pull_from_data`, including for empty data and observations with only some metrics observed Note: `get_best_raw_objective_point_with_trial_index` and `get_trace` are now more similar because they both use `get_trace_by_arm_pull_from_data`. However, they differ in a few aspects such as how they treat multi-objective configs and out-of-design points. I think it would be good if these were closer together, but probably not low-ROI at the moment. Reviewed By: Balandat Differential Revision: D74282536 fbshipit-source-id: 4612cab52ad4e53af85efb54d3d0c18568462b0d
1 parent 4bbc415 commit 0b6946c

File tree

3 files changed

+99
-85
lines changed

3 files changed

+99
-85
lines changed

ax/core/tests/test_objective.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def setUp(self) -> None:
3939

4040
def test_Init(self) -> None:
4141
with self.assertRaisesRegex(UserInputError, "does not specify"):
42-
(Objective(metric=self.metrics["m1"]),)
42+
Objective(metric=self.metrics["m1"])
4343
with self.assertRaisesRegex(
4444
UserInputError, "doesn't match the specified optimization direction"
4545
):

ax/service/tests/test_best_point_utils.py

Lines changed: 58 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,46 @@ def test_get_hypervolume_trace_of_outcomes_multi_objective(self) -> None:
177177
self.assertEqual(hvs, [0.0, 2.0, 0.0, 2.0])
178178

179179
def test_get_trace_by_arm_pull_from_data(self) -> None:
180+
objective = Objective(metric=Metric("m1"), minimize=False)
181+
optimzation_config = OptimizationConfig(
182+
objective=objective,
183+
outcome_constraints=[
184+
OutcomeConstraint(
185+
Metric("m2"), op=ComparisonOp.GEQ, bound=0.0, relative=False
186+
)
187+
],
188+
)
189+
190+
with self.subTest("No data"):
191+
df = pd.DataFrame()
192+
result = get_trace_by_arm_pull_from_data(
193+
df=df, optimization_config=optimzation_config
194+
)
195+
self.assertTrue(
196+
result.equals(
197+
pd.DataFrame(columns=["trial_index", "arm_name", "value"])
198+
)
199+
)
200+
201+
with self.subTest("Data for wrong metric"):
202+
df = pd.DataFrame.from_records(
203+
data=[
204+
{
205+
"trial_index": 0,
206+
"arm_name": "0_0",
207+
"metric_name": "wrong",
208+
"mean": 1.0,
209+
"sem": None,
210+
},
211+
]
212+
)
213+
with self.assertRaisesRegex(
214+
ValueError, "Some metrics are not present for all trials and arms"
215+
):
216+
result = get_trace_by_arm_pull_from_data(
217+
df=df, optimization_config=optimzation_config
218+
)
219+
180220
df = pd.DataFrame.from_records(
181221
data=[
182222
{"trial_index": 0, "arm_name": "0_0", "metric_name": "m1", "mean": 1.0},
@@ -200,9 +240,8 @@ def test_get_trace_by_arm_pull_from_data(self) -> None:
200240
]
201241
).assign(sem=None)
202242

203-
objective = Objective(metric=Metric("m1"), minimize=False)
204243
with self.subTest("Relative optimization config not supported"):
205-
optimization_config = OptimizationConfig(
244+
rel_optimization_config = OptimizationConfig(
206245
objective=objective,
207246
outcome_constraints=[
208247
OutcomeConstraint(
@@ -217,17 +256,9 @@ def test_get_trace_by_arm_pull_from_data(self) -> None:
217256
ValueError, "Relativized optimization config not supported"
218257
):
219258
get_trace_by_arm_pull_from_data(
220-
df=df, optimization_config=optimization_config
259+
df=df, optimization_config=rel_optimization_config
221260
)
222261

223-
optimzation_config = OptimizationConfig(
224-
objective=objective,
225-
outcome_constraints=[
226-
OutcomeConstraint(
227-
Metric("m2"), op=ComparisonOp.GEQ, bound=0.0, relative=False
228-
)
229-
],
230-
)
231262
with self.subTest("Single objective, cumulative"):
232263
result = get_trace_by_arm_pull_from_data(
233264
df=df, optimization_config=optimzation_config, use_cumulative_best=True
@@ -382,13 +413,17 @@ def test_best_raw_objective_point(self) -> None:
382413
generator_run=GeneratorRun(arms=[Arm(parameters={"x1": 5.0, "x2": 5.0})])
383414
).run().complete()
384415
exp.fetch_data()
385-
# pyre-fixme[16]: Optional type has no attribute `clone`.
386-
opt_conf = exp.optimization_config.clone()
387-
opt_conf.objective.metric._name = "not_branin"
388-
with self.assertRaisesRegex(ValueError, "No data has been logged"):
389-
get_best_raw_objective_point_with_trial_index(
390-
experiment=exp, optimization_config=opt_conf
416+
417+
with self.subTest("Data present but not for needed metrics"):
418+
opt_conf = OptimizationConfig(
419+
objective=Objective(metric=get_branin_metric(name="not_branin"))
391420
)
421+
with self.assertRaisesRegex(
422+
ValueError, "Some metrics are not present for all trials and arms"
423+
):
424+
get_best_raw_objective_point_with_trial_index(
425+
experiment=exp, optimization_config=opt_conf
426+
)
392427

393428
# Test constraints work as expected.
394429
observations = [[1.0, 2.0], [3.0, 4.0], [-5.0, -6.0]]
@@ -446,13 +481,13 @@ def test_best_raw_objective_point_unsatisfiable_relative(self) -> None:
446481
opt_conf.outcome_constraints[0].relative = True
447482
opt_conf.outcome_constraints[0].bound = 9999
448483

449-
with self.assertLogs(logger=best_point_logger, level="WARN") as lg:
484+
with self.assertRaisesRegex(
485+
DataRequiredError,
486+
"Optimization config has relative constraint, but model was not fit"
487+
" with status quo.",
488+
):
450489
get_best_raw_objective_point_with_trial_index(
451-
exp, optimization_config=opt_conf
452-
)
453-
self.assertTrue(
454-
any("No status quo provided" in warning for warning in lg.output),
455-
msg=lg.output,
490+
experiment=exp, optimization_config=opt_conf
456491
)
457492

458493
exp.status_quo = exp.trials[0].arms[0]

ax/service/utils/best_point.py

Lines changed: 40 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ def get_best_raw_objective_point_with_trial_index(
8080
"""Given an experiment, identifies the arm that had the best raw objective,
8181
based on the data fetched from the experiment.
8282
83+
Note: This function will error with an invalid configuration. If you would
84+
prefer for error logs rather than exceptions, use
85+
`get_best_by_raw_objective_with_trial_index`.
86+
8387
Args:
8488
experiment: Experiment, on which to identify best raw objective arm.
8589
optimization_config: Optimization config to use in place of the one stored
@@ -108,15 +112,10 @@ def get_best_raw_objective_point_with_trial_index(
108112
if dat.df.empty:
109113
raise ValueError("Cannot identify best point if experiment contains no data.")
110114
if any(oc.relative for oc in optimization_config.all_constraints):
111-
if experiment.status_quo is not None:
112-
optimization_config = derelativize_opt_config(
113-
optimization_config=optimization_config,
114-
experiment=experiment,
115-
)
116-
else:
117-
logger.warning(
118-
"No status quo provided; relative constraints will be ignored."
119-
)
115+
optimization_config = derelativize_opt_config(
116+
optimization_config=optimization_config,
117+
experiment=experiment,
118+
)
120119

121120
# Only COMPLETED trials should be considered when identifying the best point
122121
completed_indices = {
@@ -146,15 +145,22 @@ def get_best_raw_objective_point_with_trial_index(
146145
raise ValueError("No feasible points are in the search space.")
147146

148147
in_design_df = feasible_df.loc[is_in_design]
148+
value_by_arm_pull = get_trace_by_arm_pull_from_data(
149+
df=in_design_df,
150+
optimization_config=optimization_config,
151+
use_cumulative_best=False,
152+
)
149153

150-
objective = optimization_config.objective
151-
best_row_helper = (
152-
_get_best_row_for_scalarized_objective
153-
if isinstance(objective, ScalarizedObjective)
154-
else _get_best_row_for_single_objective
154+
maximize = isinstance(optimization_config.objective, MultiObjective) or (
155+
not optimization_config.objective.minimize
156+
)
157+
best_row_idx = (
158+
value_by_arm_pull["value"].idxmax()
159+
if maximize
160+
else value_by_arm_pull["value"].idxmin()
155161
)
156-
# pyre-ignore Incompatible parameter type [6]
157-
best_row = best_row_helper(df=in_design_df, objective=objective)
162+
best_row = value_by_arm_pull.loc[best_row_idx]
163+
158164
best_arm = experiment.arms_by_name[best_row["arm_name"]]
159165
best_trial_index = int(best_row["trial_index"])
160166
objective_rows = dat.df.loc[
@@ -321,6 +327,9 @@ def get_best_by_raw_objective_with_trial_index(
321327
TModelPredictArm is of the form:
322328
({metric_name: mean}, {metric_name_1: {metric_name_2: cov_1_2}})
323329
330+
This is a version of `get_best_raw_objective_point_with_trial_index` that
331+
logs errors rather than letting exceptions be raised.
332+
324333
Args:
325334
experiment: Experiment, on which to identify best raw objective arm.
326335
optimization_config: Optimization config to use in place of the one stored
@@ -468,51 +477,6 @@ def get_pareto_optimal_parameters(
468477
return res
469478

470479

471-
# NOTE: This function will be removed in the next PR.
472-
def _get_best_row_for_scalarized_objective(
473-
df: pd.DataFrame,
474-
objective: ScalarizedObjective,
475-
) -> pd.Series:
476-
df = df.copy()
477-
# First, add a weight column, setting 0.0 if the metric is not part
478-
# of the objective
479-
metric_to_weight = {
480-
m.name: objective.weights[i] for i, m in enumerate(objective.metrics)
481-
}
482-
df["weight"] = df["metric_name"].apply(lambda x: metric_to_weight.get(x) or 0.0)
483-
# Now, calculate the weighted linear combination via groupby,
484-
# filtering out NaN for missing data
485-
df["weighted_mean"] = df["mean"] * df["weight"]
486-
groupby_df = (
487-
df[["arm_name", "trial_index", "weighted_mean"]]
488-
.groupby(["arm_name", "trial_index"], as_index=False)
489-
.sum(min_count=1)
490-
.dropna()
491-
)
492-
if groupby_df.empty:
493-
raise ValueError("No data has been logged for scalarized objective.")
494-
return (
495-
groupby_df.loc[groupby_df["weighted_mean"].idxmin()]
496-
if objective.minimize
497-
else groupby_df.loc[groupby_df["weighted_mean"].idxmax()]
498-
)
499-
500-
501-
# NOTE: This function will be removed in the next PR.
502-
def _get_best_row_for_single_objective(
503-
df: pd.DataFrame, objective: Objective
504-
) -> pd.Series:
505-
objective_name = objective.metric.name
506-
objective_rows = df.loc[df["metric_name"] == objective_name]
507-
if objective_rows.empty:
508-
raise ValueError(f'No data has been logged for objective "{objective_name}".')
509-
return (
510-
objective_rows.loc[objective_rows["mean"].idxmin()]
511-
if objective.minimize
512-
else objective_rows.loc[objective_rows["mean"].idxmax()]
513-
)
514-
515-
516480
def _is_row_feasible(
517481
df: pd.DataFrame,
518482
optimization_config: OptimizationConfig,
@@ -779,6 +743,11 @@ def get_trace_by_arm_pull_from_data(
779743
"`Derelativize` the optimization config, or use `get_trace`."
780744
)
781745

746+
empty_result = pd.DataFrame(columns=["trial_index", "arm_name", "value"])
747+
748+
if len(df) == 0:
749+
return empty_result
750+
782751
# reshape data to wide, using only the metrics in the optimization config
783752
metrics = list(optimization_config.metrics.keys())
784753

@@ -793,6 +762,16 @@ def get_trace_by_arm_pull_from_data(
793762
.set_index(["trial_index", "arm_name", "metric_name"])["mean"]
794763
.unstack(level="metric_name")
795764
)
765+
missing_metrics = [
766+
m for m in metrics if m not in df_wide.columns or df_wide[m].isnull().any()
767+
]
768+
if len(missing_metrics) > 0:
769+
raise ValueError(
770+
"Some metrics are not present for all trials and arms. The "
771+
f"following are missing: {missing_metrics}."
772+
)
773+
if len(df_wide) == 0:
774+
return empty_result
796775
df_wide["feasible"] = df.groupby(["trial_index", "arm_name"])["row_feasible"].all()
797776
df_wide.reset_index(inplace=True)
798777

0 commit comments

Comments (0)