From 5c2e49f59b5928a4a76c04d93d57f42d7ff63765 Mon Sep 17 00:00:00 2001
From: Gareth Williams <gareth.m.j.williams@gmail.com>
Date: Mon, 2 Oct 2023 17:20:45 +0100
Subject: [PATCH] fix-and-test

---
 .../python/optimizer/bfgs.py                  | 21 ++++++---
 .../python/optimizer/bfgs_test.py             | 44 +++++++++++++++++++
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/tensorflow_probability/python/optimizer/bfgs.py b/tensorflow_probability/python/optimizer/bfgs.py
index 4a5287af30..2f455c4705 100644
--- a/tensorflow_probability/python/optimizer/bfgs.py
+++ b/tensorflow_probability/python/optimizer/bfgs.py
@@ -61,8 +61,11 @@
                                # `final_position`. If the search converged
                                # the max-norm of this tensor should be
                                # below the tolerance.
-        'inverse_hessian_estimate'  # A tensor containing the inverse of the
-                                    # estimated Hessian.
+        'inverse_hessian_estimate',  # A tensor containing the inverse of the
+                                     # estimated Hessian.
+        'scale_initial_inverse_hessian'  # Should the initial inverse Hessian
+                                         # be rescaled on the first iteration,
+                                         # as per Chapter 6 of Nocedal and Wright.
     ])
 
 
@@ -72,6 +75,7 @@ def minimize(value_and_gradients_function,
              x_tolerance=0,
              f_relative_tolerance=0,
              initial_inverse_hessian_estimate=None,
+             scale_initial_inverse_hessian=True,
              max_iterations=50,
              parallel_iterations=1,
              stopping_condition=None,
@@ -290,6 +294,7 @@ def _body(state):
         tolerance,
         control_inputs)
     kwargs['inverse_hessian_estimate'] = initial_inv_hessian
+    kwargs['scale_initial_inverse_hessian'] = scale_initial_inverse_hessian
     initial_state = BfgsOptimizerResults(**kwargs)
     return tf.while_loop(
         cond=_cond,
@@ -355,9 +360,15 @@ def _update_inv_hessian(prev_state, next_state):
   # Rescale the initial hessian at the first step, as suggested
   # in Chapter 6 of Numerical Optimization, by Nocedal and Wright.
   scale_factor = tf.where(
-      tf.math.equal(prev_state.num_iterations, 0),
-      normalization_factor / tf.reduce_sum(
-          tf.math.square(gradient_delta), axis=-1), 1.)
+    (
+      tf.math.equal(prev_state.num_iterations, 0) &
+      prev_state.scale_initial_inverse_hessian
+    ),
+    normalization_factor / tf.reduce_sum(
+        tf.math.square(gradient_delta), axis=-1
+    ),
+    1.
+  )
 
   inverse_hessian_estimate = scale_factor[
       ..., tf.newaxis, tf.newaxis] * prev_state.inverse_hessian_estimate
diff --git a/tensorflow_probability/python/optimizer/bfgs_test.py b/tensorflow_probability/python/optimizer/bfgs_test.py
index e7cc1d1d9e..e9bdb7f680 100644
--- a/tensorflow_probability/python/optimizer/bfgs_test.py
+++ b/tensorflow_probability/python/optimizer/bfgs_test.py
@@ -427,6 +427,50 @@ def himmelblau(coord):
       self.assertArrayNear(actual, expected, 1e-5)
     self.assertEqual(batch_results.num_objective_evaluations, 31)
 
+  def test_scale_initial_inverse_hessian(self):
+    """Tests optional scaling of the initial inverse Hessian estimate.
+
+    Shows that the choice of the option determines the behaviour inside
+    the BFGS optimisation.
+    """
+    @_make_val_and_grad_fn
+    def sin_x_times_sin_y(coord):
+      x, y = coord[0], coord[1]
+      return tf.math.sin(x) + tf.math.sin(y)
+
+    start = tf.constant((1, -2), dtype=np.float64)
+
+    results = {}
+    for scale in (True, False):
+      for max_iter in (1, 2, 50):
+        results[scale, max_iter] = self.evaluate(
+          bfgs.minimize(
+            sin_x_times_sin_y,
+            initial_position=start,
+            tolerance=1e-8,
+            scale_initial_inverse_hessian=scale,
+            max_iterations=max_iter,
+          )
+        )
+
+    expected_positions = {
+      # Positions traced by the optimisation on the first iteration
+      # are not affected by the choice of `scale_initial_inverse_hessian`.
+      (True, 1): (-0.62581634, -0.7477782),
+      (False, 1): (-0.62581634, -0.7477782),
+      # However, gradient calculations on the first iteration _are_ affected,
+      # and this affects positions identified on the second iteration.
+      (True, 2): (-1.70200959, -0.37774139),
+      (False, 2): (-1.24714478, -0.55028845),
+      # Both approaches converge to the same maximum eventually (although
+      # this is not guaranteed, it depends on the exact problem being solved).
+      (True, 50): (-1.57079633, -1.57079633),
+      (False, 50): (-1.57079633, -1.57079633),
+    }
+
+    for key, res in results.items():
+      self.assertArrayNear(res.position, expected_positions[key], 1e-6)
+
   def test_data_fitting(self):
     """Tests MLE estimation for a simple geometric GLM."""
     n, dim = 100, 3