diff --git a/tensorflow_probability/python/optimizer/bfgs.py b/tensorflow_probability/python/optimizer/bfgs.py index 4a5287af30..2f455c4705 100644 --- a/tensorflow_probability/python/optimizer/bfgs.py +++ b/tensorflow_probability/python/optimizer/bfgs.py @@ -61,8 +61,11 @@ # `final_position`. If the search converged # the max-norm of this tensor should be # below the tolerance. - 'inverse_hessian_estimate' # A tensor containing the inverse of the - # estimated Hessian. + 'inverse_hessian_estimate', # A tensor containing the inverse of the + # estimated Hessian. + 'scale_initial_inverse_hessian' # Should the initial inverse Hessian + # be rescaled on the first iteration, + # as per Chapter 6 of Nocedal and Wright. ]) @@ -72,6 +75,7 @@ def minimize(value_and_gradients_function, x_tolerance=0, f_relative_tolerance=0, initial_inverse_hessian_estimate=None, + scale_initial_inverse_hessian=True, max_iterations=50, parallel_iterations=1, stopping_condition=None, @@ -290,6 +294,7 @@ def _body(state): tolerance, control_inputs) kwargs['inverse_hessian_estimate'] = initial_inv_hessian + kwargs['scale_initial_inverse_hessian'] = scale_initial_inverse_hessian initial_state = BfgsOptimizerResults(**kwargs) return tf.while_loop( cond=_cond, @@ -355,9 +360,15 @@ def _update_inv_hessian(prev_state, next_state): # Rescale the initial hessian at the first step, as suggested # in Chapter 6 of Numerical Optimization, by Nocedal and Wright. scale_factor = tf.where( - tf.math.equal(prev_state.num_iterations, 0), - normalization_factor / tf.reduce_sum( - tf.math.square(gradient_delta), axis=-1), 1.) + ( + tf.math.equal(prev_state.num_iterations, 0) & + prev_state.scale_initial_inverse_hessian + ), + normalization_factor / tf.reduce_sum( + tf.math.square(gradient_delta), axis=-1 + ), + 1. + ) inverse_hessian_estimate = scale_factor[ ..., tf.newaxis, tf.newaxis] * prev_state.inverse_hessian_estimate diff --git a/tensorflow_probability/python/optimizer/bfgs_test.py b/tensorflow_probability/python/optimizer/bfgs_test.py index e7cc1d1d9e..e9bdb7f680 100644 --- a/tensorflow_probability/python/optimizer/bfgs_test.py +++ b/tensorflow_probability/python/optimizer/bfgs_test.py @@ -427,6 +427,50 @@ def himmelblau(coord): self.assertArrayNear(actual, expected, 1e-5) self.assertEqual(batch_results.num_objective_evaluations, 31) + def test_scale_initial_inverse_hessian(self): + """Tests optional scaling of the initial inverse Hessian estimate. + + Shows that the choice of the option determines the behaviour inside + the BFGS optimisation. + """ + @_make_val_and_grad_fn + def sin_x_times_sin_y(coord): + x, y = coord[0], coord[1] + return tf.math.sin(x) + tf.math.sin(y) + + start = tf.constant((1, -2), dtype=np.float64) + + results = {} + for scale in (True, False): + for max_iter in (1, 2, 50): + results[scale, max_iter] = self.evaluate( + bfgs.minimize( + sin_x_times_sin_y, + initial_position=start, + tolerance=1e-8, + scale_initial_inverse_hessian=scale, + max_iterations=max_iter, + ) + ) + + expected_positions = { + # Positions traced by the optimisation on the first iteration + # are not affected by the choice of `scale_initial_inverse_hessian`. + (True, 1): (-0.62581634, -0.7477782), + (False, 1): (-0.62581634, -0.7477782), + # However, gradient calculations on the first iteration _are_ affected, + # and this affects positions identified on the second iteration. + (True, 2): (-1.70200959, -0.37774139), + (False, 2): (-1.24714478, -0.55028845), + # Both approaches converge to the same maximum eventually (although + # this is not guaranteed, it depends on the exact problem being solved). + (True, 50): (-1.57079633, -1.57079633), + (False, 50): (-1.57079633, -1.57079633), + } + + for key, res in results.items(): + self.assertArrayNear(res.position, expected_positions[key], 1e-6) + def test_data_fitting(self): """Tests MLE estimation for a simple geometric GLM.""" n, dim = 100, 3