diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index c0e2a54540..c42ce4e423 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -187,8 +187,8 @@ they supersede all previous conventions. 1. Submodule names should be singular, except where they overlap to TF. Justification: Having plural looks strange in user code, ie, - tf.optimizer.Foo reads nicer than tf.optimizers.Foo since submodules are - only used to access a single, specific thing (at a time). + tf.optimizer.Foo reads nicer than tf.keras.optimizers.Foo since submodules + are only used to access a single, specific thing (at a time). 1. Use `tf.newaxis` rather than `None` to `tf.expand_dims`. diff --git a/SUBSTRATES.md b/SUBSTRATES.md index e926007a4f..35ccadf034 100644 --- a/SUBSTRATES.md +++ b/SUBSTRATES.md @@ -75,11 +75,11 @@ vmap, etc.), we will special-case using an `if JAX_MODE:` block. tests, TFP impl, etc), with `tfp.math.value_and_gradient` or similar. Then, we can special-case `JAX_MODE` inside the body of `value_and_gradient`. -* __`tf.Variable`, `tf.optimizers.Optimizer`__ +* __`tf.Variable`, `tf.keras.optimizers.Optimizer`__ TF provides a `Variable` abstraction so that graph functions may modify - state, including using the TF `Optimizer` subclasses like `Adam`. JAX, in - contrast, operates only on pure functions. In general, TFP is fairly + state, including using the Keras `Optimizer` subclasses like `Adam`. JAX, + in contrast, operates only on pure functions. In general, TFP is fairly functional (e.g. `tfp.optimizer.lbfgs_minimize`), but in some cases (e.g. `tfp.vi.fit_surrogate_posterior`, `tfp.optimizer.StochasticGradientLangevinDynamics`) we have felt the diff --git a/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Latent_Variable_Model.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Latent_Variable_Model.ipynb index 352461a31c..8ae554c36d 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Latent_Variable_Model.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Latent_Variable_Model.ipynb @@ -345,7 +345,7 @@ " unconstrained_observation_noise,\n", " latent_index_points]\n", "\n", - "optimizer = tf.optimizers.Adam(learning_rate=1.0)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=1.0)\n", "\n", "@tf.function(autograph=False, jit_compile=True)\n", "def train_model():\n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb index 2a86903c1e..af1b67a7ec 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb @@ -541,7 +541,7 @@ "source": [ "# Now we optimize the model parameters.\n", "num_iters = 1000\n", - "optimizer = tf.optimizers.Adam(learning_rate=.01)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=.01)\n", "\n", "# Use `tf.function` to trace the loss for more efficient evaluation.\n", "@tf.function(autograph=False, jit_compile=False)\n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Model_Variational_Inference.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Model_Variational_Inference.ipynb index b60c89bfe6..874d6fcb97 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Model_Variational_Inference.ipynb +++ 
b/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Model_Variational_Inference.ipynb @@ -800,7 +800,7 @@ }, "outputs": [], "source": [ - "optimizer = tf.optimizers.Adam(learning_rate=1e-2)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)\n", "\n", "losses = tfp.vi.fit_surrogate_posterior(\n", " target_log_prob_fn, \n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Models.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Models.ipynb index 81a7bd6c27..d9fb7b6b5e 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Models.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Linear_Mixed_Effects_Models.ipynb @@ -743,7 +743,7 @@ " previous_kernel_results=kernel_results)\n", " return next_state, next_kernel_results\n", "\n", - "optimizer = tf.optimizers.Adam(learning_rate=.01)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=.01)\n", "\n", "# Set up M-step (gradient descent).\n", "@tf.function(autograph=False, jit_compile=True)\n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Multiple_changepoint_detection_and_Bayesian_model_selection.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Multiple_changepoint_detection_and_Bayesian_model_selection.ipynb index 6c2139f913..e41f6fe90a 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Multiple_changepoint_detection_and_Bayesian_model_selection.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Multiple_changepoint_detection_and_Bayesian_model_selection.ipynb @@ -317,7 +317,7 @@ "\n", "losses = tfp.math.minimize(\n", " lambda: -log_prob(),\n", - " optimizer=tf.optimizers.Adam(learning_rate=0.1),\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),\n", " num_steps=100)\n", "plt.plot(losses)\n", "plt.ylabel('Negative log marginal likelihood')" @@ -740,7 +740,7 @@ "source": [ "losses = tfp.math.minimize(\n", " lambda: -log_prob(),\n", - " optimizer=tf.optimizers.Adam(0.1),\n", + " optimizer=tf.keras.optimizers.Adam(0.1),\n", " num_steps=100)\n", "plt.plot(losses)\n", "plt.ylabel('Negative log marginal likelihood')" diff --git a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_Regression.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_Regression.ipynb index 0fea808da2..f90231691d 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_Regression.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_Regression.ipynb @@ -289,7 +289,7 @@ "])\n", "\n", "# Do inference.\n", - "model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", + "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", "model.fit(x, y, epochs=1000, verbose=False);\n", "\n", "# Profit.\n", @@ -391,7 +391,7 @@ "])\n", "\n", "# Do inference.\n", - "model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", + "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", "model.fit(x, y, epochs=1000, verbose=False);\n", "\n", "# Profit.\n", @@ -540,7 +540,7 @@ "])\n", "\n", "# Do inference.\n", - "model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", + "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", "model.fit(x, y, epochs=1000, verbose=False);\n", "\n", "# Profit.\n", @@ -650,7 +650,7 @@ 
"])\n", "\n", "# Do inference.\n", - "model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", + "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss=negloglik)\n", "model.fit(x, y, epochs=1000, verbose=False);\n", "\n", "# Profit.\n", @@ -806,7 +806,7 @@ "batch_size = 32\n", "loss = lambda y, rv_y: rv_y.variational_loss(\n", " y, kl_weight=np.array(batch_size, x.dtype) / x.shape[0])\n", - "model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=loss)\n", + "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss=loss)\n", "model.fit(x, y, batch_size=batch_size, epochs=1000, verbose=False)\n", "\n", "# Profit.\n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_VAE.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_VAE.ipynb index 063a7041d7..71cd8347ed 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_VAE.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_VAE.ipynb @@ -434,7 +434,7 @@ "source": [ "negloglik = lambda x, rv_x: -rv_x.log_prob(x)\n", "\n", - "vae.compile(optimizer=tf.optimizers.Adam(learning_rate=1e-3),\n", + "vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),\n", " loss=negloglik)\n", "\n", "_ = vae.fit(train_dataset,\n", diff --git a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_PCA.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_PCA.ipynb index f3c38dc8a5..0de23fb122 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_PCA.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_PCA.ipynb @@ -337,7 +337,7 @@ "target_log_prob_fn = lambda w, z: model.log_prob((w, z, x_train))\n", "losses = tfp.math.minimize(\n", " lambda: -target_log_prob_fn(w, z),\n", - " optimizer=tf.optimizers.Adam(learning_rate=0.05),\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),\n", " num_steps=200)" ] }, @@ -479,7 +479,7 @@ "losses = tfp.vi.fit_surrogate_posterior(\n", " target_log_prob_fn,\n", " surrogate_posterior=surrogate_posterior,\n", - " optimizer=tf.optimizers.Adam(learning_rate=0.05),\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),\n", " num_steps=200)" ] }, diff --git a/tensorflow_probability/examples/jupyter_notebooks/STS_approximate_inference_for_models_with_non_Gaussian_observations.ipynb b/tensorflow_probability/examples/jupyter_notebooks/STS_approximate_inference_for_models_with_non_Gaussian_observations.ipynb index 7316016f68..6c86b1969b 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/STS_approximate_inference_for_models_with_non_Gaussian_observations.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/STS_approximate_inference_for_models_with_non_Gaussian_observations.ipynb @@ -660,7 +660,7 @@ "t0 = time.time()\n", "losses = tfp.vi.fit_surrogate_posterior(pinned_model.unnormalized_log_prob,\n", " surrogate_posterior,\n", - " optimizer=tf.optimizers.Adam(0.1),\n", + " optimizer=tf.keras.optimizers.Adam(0.1),\n", " num_steps=num_variational_steps)\n", "t1 = time.time()\n", "print(\"Inference ran in {:.2f}s.\".format(t1-t0))" diff --git a/tensorflow_probability/examples/jupyter_notebooks/TFP_Release_Notebook_0_12_1.ipynb b/tensorflow_probability/examples/jupyter_notebooks/TFP_Release_Notebook_0_12_1.ipynb index 85728d1589..8bbd6eb75e 100644 --- 
a/tensorflow_probability/examples/jupyter_notebooks/TFP_Release_Notebook_0_12_1.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/TFP_Release_Notebook_0_12_1.ipynb @@ -1237,7 +1237,7 @@ "\r\n", "asvi_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,\r\n", " asvi_surrogate_posterior,\r\n", - " optimizer=tf.optimizers.Adam(learning_rate=0.1),\r\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),\r\n", " num_steps=500)\r\n", "logging.getLogger('tensorflow').setLevel(logging.NOTSET)" ] @@ -1255,7 +1255,7 @@ "\r\n", "factored_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,\r\n", " factored_surrogate_posterior,\r\n", - " optimizer=tf.optimizers.Adam(learning_rate=0.1),\r\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),\r\n", " num_steps=500)" ] }, diff --git a/tensorflow_probability/examples/jupyter_notebooks/Variational_Inference_and_Joint_Distributions.ipynb b/tensorflow_probability/examples/jupyter_notebooks/Variational_Inference_and_Joint_Distributions.ipynb index 74a15b0a62..604d7c8663 100644 --- a/tensorflow_probability/examples/jupyter_notebooks/Variational_Inference_and_Joint_Distributions.ipynb +++ b/tensorflow_probability/examples/jupyter_notebooks/Variational_Inference_and_Joint_Distributions.ipynb @@ -512,7 +512,7 @@ } ], "source": [ - "optimizer = tf.optimizers.Adam(learning_rate=1e-2)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)\n", "mvn_loss = tfp.vi.fit_surrogate_posterior(\n", " target_model.unnormalized_log_prob,\n", " surrogate_posterior,\n", @@ -706,7 +706,7 @@ } ], "source": [ - "optimizer=tf.optimizers.Adam(learning_rate=1e-2)\n", + "optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2)\n", "iaf_loss = tfp.vi.fit_surrogate_posterior(\n", " target_model.unnormalized_log_prob,\n", " iaf_surrogate_posterior,\n", @@ -830,7 +830,7 @@ " mean_field_scale # apply the block matrix transformation to the standard Normal distribution\n", " ]))\n", "\n", - "optimizer=tf.optimizers.Adam(learning_rate=1e-2)\n", + "optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2)\n", "mean_field_loss = tfp.vi.fit_surrogate_posterior(\n", " target_model.unnormalized_log_prob,\n", " mean_field_surrogate_posterior,\n", diff --git a/tensorflow_probability/python/bijectors/masked_autoregressive.py b/tensorflow_probability/python/bijectors/masked_autoregressive.py index fca576dadd..e948d3c63c 100644 --- a/tensorflow_probability/python/bijectors/masked_autoregressive.py +++ b/tensorflow_probability/python/bijectors/masked_autoregressive.py @@ -664,7 +664,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.optimizers.Adam(), + model.compile(optimizer=tf.keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 25 @@ -718,7 +718,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): x_, bijector_kwargs={'conditional_input': c_}) model = tfk.Model([x_, c_], log_prob_) - model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1), + model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), loss=lambda _, log_prob: -log_prob) batch_size = 25 @@ -780,7 +780,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.optimizers.Adam(), + model.compile(optimizer=tf.keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 10 @@ -838,7 +838,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): 
log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.optimizers.Adam(), + model.compile(optimizer=tf.keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 10 diff --git a/tensorflow_probability/python/distributions/gaussian_process.py b/tensorflow_probability/python/distributions/gaussian_process.py index 8ec7f076ad..6c30c6a69e 100644 --- a/tensorflow_probability/python/distributions/gaussian_process.py +++ b/tensorflow_probability/python/distributions/gaussian_process.py @@ -221,7 +221,7 @@ class GaussianProcess( gp = tfd.GaussianProcess(kernel, observed_index_points) - optimizer = tf.optimizers.Adam() + optimizer = tf.keras.optimizers.Adam() @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/gaussian_process_regression_model.py b/tensorflow_probability/python/distributions/gaussian_process_regression_model.py index 9beb6c12cb..84f9b4966b 100644 --- a/tensorflow_probability/python/distributions/gaussian_process_regression_model.py +++ b/tensorflow_probability/python/distributions/gaussian_process_regression_model.py @@ -190,7 +190,7 @@ class GaussianProcessRegressionModel( index_points=observation_index_points, observation_noise_variance=observation_noise_variance) - optimizer = tf.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) + optimizer = tf.keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/student_t_process.py b/tensorflow_probability/python/distributions/student_t_process.py index 7441b4cb38..6395474f99 100644 --- a/tensorflow_probability/python/distributions/student_t_process.py +++ b/tensorflow_probability/python/distributions/student_t_process.py @@ -226,7 +226,7 @@ class StudentTProcess(distribution.AutoCompositeTensorDistribution): tp = tfd.StudentTProcess(3., kernel, observed_index_points) - optimizer = tf.optimizers.Adam() + optimizer = tf.keras.optimizers.Adam() @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/variational_gaussian_process.py b/tensorflow_probability/python/distributions/variational_gaussian_process.py index e10d4442d1..ae06d41631 100644 --- a/tensorflow_probability/python/distributions/variational_gaussian_process.py +++ b/tensorflow_probability/python/distributions/variational_gaussian_process.py @@ -558,7 +558,7 @@ class VariationalGaussianProcess(gaussian_process.GaussianProcess, # For training, we use some simplistic numpy-based minibatching. batch_size = 64 - optimizer = tf.optimizers.Adam(learning_rate=.1) + optimizer = tf.keras.optimizers.Adam(learning_rate=.1) @tf.function def optimize(x_train_batch, y_train_batch): @@ -670,7 +670,7 @@ def optimize(x_train_batch, y_train_batch): # For training, we use some simplistic numpy-based minibatching. 
batch_size = 64 - optimizer = tf.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) + optimizer = tf.keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) @tf.function def optimize(x_train_batch, y_train_batch): diff --git a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py index ef0a9656a2..baf0ced5f5 100644 --- a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py +++ b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py @@ -107,7 +107,7 @@ def make_distribution_bijector(distribution, name='make_distribution_bijector'): pinned_model) _ = tfp.vi.fit_surrogate_posterior(pinned_model.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(0.01), + optimizer=tf.keras.optimizers.Adam(0.01), num_steps=200) ``` diff --git a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py index 731d4953b2..83a86b670e 100644 --- a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py +++ b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py @@ -205,7 +205,7 @@ def model_with_funnel(): optimization.fit_surrogate_posterior( pinned_model.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(0.01), + optimizer=tf.keras.optimizers.Adam(0.01), sample_size=10, num_steps=1) bijector = ( diff --git a/tensorflow_probability/python/experimental/distributions/importance_resample.py b/tensorflow_probability/python/experimental/distributions/importance_resample.py index 93a46634c0..8acc7bcecd 100644 --- a/tensorflow_probability/python/experimental/distributions/importance_resample.py +++ b/tensorflow_probability/python/experimental/distributions/importance_resample.py @@ -142,7 +142,7 @@ def target_log_prob_fn(x): importance_weighted_losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn, surrogate_posterior=proposal_distribution, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=200, importance_sample_size=importance_sample_size) approximate_posterior = tfed.ImportanceResample( @@ -167,7 +167,7 @@ def target_log_prob_fn(x): proposal_distribution=proposal_distribution, target_log_prob_fn=target_log_prob_fn, importance_sample_size=importance_sample_size), - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=200) ``` diff --git a/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py b/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py index 2cb2c68731..ff685713f1 100644 --- a/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py +++ b/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py @@ -246,7 +246,7 @@ def target_log_prob_fn(loc, scale): pulled_back_shape) vars = tf.nest.map_structure(tf.Variable, uniform_init) - opt = tf.optimizers.Adam(.01) + opt = tf.keras.optimizers.Adam(.01) @tf.function(autograph=False) def one_step(): diff --git a/tensorflow_probability/python/experimental/nn/affine_layers.py b/tensorflow_probability/python/experimental/nn/affine_layers.py index f07aca03dd..66ca98b72d 100644 --- a/tensorflow_probability/python/experimental/nn/affine_layers.py +++ 
b/tensorflow_probability/python/experimental/nn/affine_layers.py @@ -206,7 +206,7 @@ def loss_fn(): kl = bnn.extra_loss / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() diff --git a/tensorflow_probability/python/experimental/nn/affine_layers_test.py b/tensorflow_probability/python/experimental/nn/affine_layers_test.py index 91ab67de86..10682d0091 100644 --- a/tensorflow_probability/python/experimental/nn/affine_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/affine_layers_test.py @@ -87,7 +87,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers.py b/tensorflow_probability/python/experimental/nn/convolutional_layers.py index 143a33129f..4255bcbb7a 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers.py @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss # Already normalized via `penalty_weight` arg. loss = nll + kl return loss, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py index 9fd5e2e962..1fb3b95337 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py @@ -79,7 +79,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py index 48a7108d04..5ca655cd3c 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss # Already normalized via `penalty_weight` arg. 
loss = nll + kl return loss, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py index 93b5d987c5..4ff83235f1 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py @@ -78,7 +78,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py index 948ab47000..8c6b3f3288 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() diff --git a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py index e7c166644d..86e4018c52 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py @@ -78,7 +78,7 @@ def loss_fn(): kl = tfn.losses.compute_extra_loss(bnn) / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.optimizers.Adam() + opt = tf.keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb b/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb index d43067ed41..9ee6a94450 100644 --- a/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb @@ -448,7 +448,7 @@ " loss, (nll, kl), _ = compute_loss_bnn(x, y)\n", " return loss, (nll, kl)\n", "\n", - "opt_bnn = tf.optimizers.Adam(learning_rate=0.003)\n", + "opt_bnn = tf.keras.optimizers.Adam(learning_rate=0.003)\n", " \n", "fit_bnn = tfn.util.make_fit_op(\n", " train_loss_bnn,\n", @@ -1280,7 +1280,7 @@ " nll, _ = compute_loss_dnn(x, y)\n", " return nll, None\n", "\n", - "opt_dnn = tf.optimizers.Adam(learning_rate=0.003)\n", + "opt_dnn = tf.keras.optimizers.Adam(learning_rate=0.003)\n", " \n", "fit_dnn = tfn.util.make_fit_op(\n", " train_loss_dnn,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb b/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb index 
4f95187252..dc5a30fdd4 100644 --- a/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb @@ -475,7 +475,7 @@ " beta=beta,\n", " seed=seedstream)\n", "\n", - "opt = tf.optimizers.Adam(lr)\n", + "opt = tf.keras.optimizers.Adam(lr)\n", "train_op = tfn.util.make_fit_op(\n", " loss_fn=loss_fn, optimizer=opt,\n", " trainable_variables=loss_fn.trainable_variables,\n", @@ -675,7 +675,7 @@ " beta=beta,\n", " seed=seedstream)\n", "\n", - " opt = tf.optimizers.Adam(lr)\n", + " opt = tf.keras.optimizers.Adam(lr)\n", " train_op = tfn.util.make_fit_op(\n", " loss_fn=loss_fn, optimizer=opt,\n", " trainable_variables=loss_fn.trainable_variables,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb b/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb index 36bf97c7a9..ce5f50d62a 100644 --- a/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb @@ -380,7 +380,7 @@ " loss, (nll, kl), _ = compute_loss(x)\n", " return loss, (nll, kl)\n", "\n", - "opt = tf.optimizers.Adam(learning_rate=1e-3)\n", + "opt = tf.keras.optimizers.Adam(learning_rate=1e-3)\n", "\n", "fit = tfn.util.make_fit_op(\n", " loss,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb b/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb index 11cef0a914..3c3581c392 100644 --- a/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb @@ -390,7 +390,7 @@ " loss, (nll, kl), _ = compute_loss(x, y, beta=0.075)\n", " return loss, (nll, kl)\n", "\n", - "opt = tf.optimizers.Adam(learning_rate=1e-3, decay=0.00005)\n", + "opt = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=0.00005)\n", "\n", "fit = tfn.util.make_fit_op(\n", " loss,\n", diff --git a/tensorflow_probability/python/experimental/nn/util/utils.py b/tensorflow_probability/python/experimental/nn/util/utils.py index 1e60503682..cde61fe0d9 100644 --- a/tensorflow_probability/python/experimental/nn/util/utils.py +++ b/tensorflow_probability/python/experimental/nn/util/utils.py @@ -249,7 +249,7 @@ def make_fit_op(loss_fn, optimizer, trainable_variables, loss_fn: Python `callable` which returns the pair `loss` (`tf.Tensor`) and any other second result such that `tf.nest.map_structure(tf.convert_to_tensor, other)` will succeed. - optimizer: `tf.optimizers.Optimizer`-like instance which has members + optimizer: `tf.keras.optimizers.Optimizer`-like instance which has members `gradient` and `apply_gradients`. trainable_variables: `tf.nest.flatten`-able structure of `tf.Variable` instances. 
diff --git a/tensorflow_probability/python/experimental/util/trainable.py b/tensorflow_probability/python/experimental/util/trainable.py index d668ffae8f..84c5d5f833 100644 --- a/tensorflow_probability/python/experimental/util/trainable.py +++ b/tensorflow_probability/python/experimental/util/trainable.py @@ -185,7 +185,7 @@ def _make_trainable(cls, model = tfp.util.make_trainable(tfd.Normal) losses = tfp.math.minimize( lambda: -model.log_prob(samples), - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=200) print('Fit Normal distribution with mean {} and stddev {}'.format( model.mean(), diff --git a/tensorflow_probability/python/experimental/util/trainable_test.py b/tensorflow_probability/python/experimental/util/trainable_test.py index e9ed422207..68b3c1c0ef 100644 --- a/tensorflow_probability/python/experimental/util/trainable_test.py +++ b/tensorflow_probability/python/experimental/util/trainable_test.py @@ -198,7 +198,7 @@ def test_docstring_example_normal(self): normal.Normal, seed=test_util.test_seed(sampler_type='stateless')) losses = minimize( lambda: -model.log_prob(samples), - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=200) self.evaluate(tf1.global_variables_initializer()) self.evaluate(losses) diff --git a/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py b/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py index 9f6b4e0e24..596d0c4c59 100644 --- a/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py +++ b/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py @@ -497,7 +497,7 @@ def model_fn(): target_log_prob_fn, surrogate_posterior=surrogate_posterior, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) # After optimization, samples from the surrogate will approximate diff --git a/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py b/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py index 9d94e3dcfd..61160fc595 100644 --- a/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py +++ b/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py @@ -239,7 +239,7 @@ def test_fitting_surrogate_posterior(self, dtype): target_log_prob, surrogate_posterior, num_steps=3, # Don't optimize to completion. - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=5) # Compute posterior statistics. 
diff --git a/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py b/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py index 7c6647c3a4..6b189120e6 100644 --- a/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py +++ b/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py @@ -153,7 +153,7 @@ def model_fn(): lambda rate, concentration: model.log_prob([rate, concentration, y]), surrogate_posterior=surrogate_posterior, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) # After optimization, samples from the surrogate will approximate @@ -350,7 +350,7 @@ def model_fn(): target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) ``` """ @@ -532,7 +532,7 @@ def model_fn(): target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) ``` @@ -728,7 +728,7 @@ def build_split_flow_surrogate_posterior( target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) ``` diff --git a/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py b/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py index b6298f6255..f215928b0b 100644 --- a/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py +++ b/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py @@ -131,7 +131,7 @@ def _test_fitting(self, model, surrogate_posterior): lambda rate, concentration: model.log_prob((rate, concentration, y)), surrogate_posterior, num_steps=5, # Don't optimize to completion. - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), sample_size=10) # Compute posterior statistics. diff --git a/tensorflow_probability/python/internal/trainable_state_util_test.py b/tensorflow_probability/python/internal/trainable_state_util_test.py index aeb374c037..cf0ff57c8b 100644 --- a/tensorflow_probability/python/internal/trainable_state_util_test.py +++ b/tensorflow_probability/python/internal/trainable_state_util_test.py @@ -347,7 +347,7 @@ def test_fitting_example(self): trainable_dist = build_trainable_normal( shape=[], seed=test_util.test_seed(sampler_type='stateless')) - optimizer = tf.optimizers.Adam(1.0) + optimizer = tf.keras.optimizers.Adam(1.0) # Find the maximum likelihood distribution given observed data. x_observed = [3., -2., 1.7] losses = minimize( diff --git a/tensorflow_probability/python/math/minimize.py b/tensorflow_probability/python/math/minimize.py index 8fa2f295b6..12f23cbae1 100644 --- a/tensorflow_probability/python/math/minimize.py +++ b/tensorflow_probability/python/math/minimize.py @@ -410,7 +410,7 @@ def minimize_stateless(loss_fn, def _make_stateful_optimizer_step_fn(loss_fn, optimizer, trainable_variables): - """Constructs a single step of a stateful (`tf.optimizers`) optimizer.""" + """Constructs a single step of a stateful (`tf.keras.optimizers`) optimizer.""" @tf.function(autograph=False) def optimizer_step(parameters, @@ -460,8 +460,8 @@ def minimize(loss_fn, `tfp.random.sanitize_seed`). num_steps: Python `int` maximum number of steps to run the optimizer. optimizer: Optimizer instance to use. 
This may be a TF1-style - `tf.train.Optimizer`, TF2-style `tf.optimizers.Optimizer`, or any Python - object that implements `optimizer.apply_gradients(grads_and_vars)`. + `tf.train.Optimizer`, TF2-style `tf.keras.optimizers.Optimizer`, or any + Python object that implements `optimizer.apply_gradients(grads_and_vars)`. convergence_criterion: Optional instance of `tfp.optimizer.convergence_criteria.ConvergenceCriterion` representing a criterion for detecting convergence. If `None`, @@ -528,9 +528,10 @@ def minimize(loss_fn, ```python x = tf.Variable(0.) loss_fn = lambda: (x - 5.)**2 - losses = tfp.math.minimize(loss_fn, - num_steps=100, - optimizer=tf.optimizers.Adam(learning_rate=0.1)) + losses = tfp.math.minimize( + loss_fn, + num_steps=100, + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1)) # In TF2/eager mode, the optimization runs immediately. print("optimized value is {} with loss {}".format(x, losses[-1])) @@ -552,7 +553,9 @@ def minimize(loss_fn, ```python losses = tfp.math.minimize( - loss_fn, num_steps=1000, optimizer=tf.optimizers.Adam(learning_rate=0.1), + loss_fn, + num_steps=1000, + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), convergence_criterion=( tfp.optimizers.convergence_criteria.LossNotDecreasing(atol=0.01))) ``` @@ -574,7 +577,7 @@ def minimize(loss_fn, trace_fn = lambda traceable_quantities: { 'loss': traceable_quantities.loss, 'x': x} trace = tfp.math.minimize(loss_fn, num_steps=100, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), trace_fn=trace_fn) print(trace['loss'].shape, # => [100] trace['x'].shape) # => [100] @@ -594,7 +597,7 @@ def minimize(loss_fn, 'loss': traceable_quantities.loss, 'has_converged': traceable_quantities.has_converged} trace = tfp.math.minimize(loss_fn, num_steps=100, - optimizer=tf.optimizers.Adam(0.1),, + optimizer=tf.keras.optimizers.Adam(0.1), trace_fn=trace_fn, convergence_criterion=( tfp.optimizers.convergence_criteria.LossNotDecreasing(atol=0.01))) diff --git a/tensorflow_probability/python/mcmc/hmc.py b/tensorflow_probability/python/mcmc/hmc.py index 9019f8d0dc..8f4f9d5a60 100644 --- a/tensorflow_probability/python/mcmc/hmc.py +++ b/tensorflow_probability/python/mcmc/hmc.py @@ -308,7 +308,7 @@ def make_response_likelihood(w, x): log_sigma = tf.Variable(0., dtype=dtype, name='log_sigma') - optimizer = tf.optimizers.SGD(learning_rate=0.01) + optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) @tf.function def mcem_iter(weights_chain_start, step_size): diff --git a/tensorflow_probability/python/mcmc/hmc_test.py b/tensorflow_probability/python/mcmc/hmc_test.py index 5aa88ee549..464e655f41 100644 --- a/tensorflow_probability/python/mcmc/hmc_test.py +++ b/tensorflow_probability/python/mcmc/hmc_test.py @@ -997,7 +997,7 @@ def test_mcem_converges(self): sigma = deferred_tensor.TransformedVariable( name='sigma', initial_value=np.array(1, dtype), bijector=exp.Exp()) - optimizer = tf.optimizers.SGD(learning_rate=0.01) + optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) # TODO(b/144045420): eliminate the need for this tf.function decorator.
The # reason it was added was that the test code is written to work in both diff --git a/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py b/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py index 401a3d23f7..d1dc41f262 100644 --- a/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py +++ b/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py @@ -44,7 +44,7 @@ def test_stochastic_optimization(self): trained_dist = normal.Normal(locs, scales) target_dist = normal.Normal(loc=-0.4, scale=1.2) - optimizer = tf.optimizers.Adam(learning_rate=0.1) + optimizer = tf.keras.optimizers.Adam(learning_rate=0.1) @tf.function(autograph=False) def optimization_step(): with tf.GradientTape() as tape: diff --git a/tensorflow_probability/python/sts/default_model.py b/tensorflow_probability/python/sts/default_model.py index fb1f138425..988e6328b2 100644 --- a/tensorflow_probability/python/sts/default_model.py +++ b/tensorflow_probability/python/sts/default_model.py @@ -95,7 +95,7 @@ def build_default_model(observed_time_series, losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=1000, convergence_criterion=( tfp.optimizer.convergence_criteria.SuccessiveGradientsAreUncorrelated( diff --git a/tensorflow_probability/python/sts/default_model_test.py b/tensorflow_probability/python/sts/default_model_test.py index d96b2a471d..ede2c6b6c3 100644 --- a/tensorflow_probability/python/sts/default_model_test.py +++ b/tensorflow_probability/python/sts/default_model_test.py @@ -111,7 +111,7 @@ def test_docstring_fitting_example(self): _ = optimization.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=1000, convergence_criterion=(successive_gradients_are_uncorrelated .SuccessiveGradientsAreUncorrelated( diff --git a/tensorflow_probability/python/sts/fitting.py b/tensorflow_probability/python/sts/fitting.py index a5cf0f4ee1..8daea0c33a 100644 --- a/tensorflow_probability/python/sts/fitting.py +++ b/tensorflow_probability/python/sts/fitting.py @@ -132,7 +132,7 @@ def build_factored_surrogate_posterior( loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=200) posterior_samples = surrogate_posterior.sample(50) @@ -152,7 +152,7 @@ def loss_fn(): surrogate_posterior, sample_size=10) - optimizer = tf.optimizers.Adam(learning_rate=0.1) + optimizer = tf.keras.optimizers.Adam(learning_rate=0.1) for step in range(200): with tf.GradientTape() as tape: loss = loss_fn() diff --git a/tensorflow_probability/python/sts/forecast.py b/tensorflow_probability/python/sts/forecast.py index 32c2322571..c23154e69e 100644 --- a/tensorflow_probability/python/sts/forecast.py +++ b/tensorflow_probability/python/sts/forecast.py @@ -120,7 +120,7 @@ def one_step_predictive(model, observed_time_series, parameter_samples, loss_curve = tfp.vi.fit_surrogate_posterior( 
target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=200) samples = surrogate_posterior.sample(30) @@ -272,7 +272,7 @@ def forecast(model, loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=200) samples = surrogate_posterior.sample(30) diff --git a/tensorflow_probability/python/sts/structural_time_series.py b/tensorflow_probability/python/sts/structural_time_series.py index 37475353b1..9b3a4c92fb 100644 --- a/tensorflow_probability/python/sts/structural_time_series.py +++ b/tensorflow_probability/python/sts/structural_time_series.py @@ -346,7 +346,7 @@ def joint_distribution(self, losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=jd.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), num_steps=200) parameter_samples = surrogate_posterior.sample(50) diff --git a/tensorflow_probability/python/util/deferred_tensor.py b/tensorflow_probability/python/util/deferred_tensor.py index 7b0f9f52fe..4d846a2577 100644 --- a/tensorflow_probability/python/util/deferred_tensor.py +++ b/tensorflow_probability/python/util/deferred_tensor.py @@ -156,7 +156,7 @@ class DeferredTensor(six.with_metaclass( Which we could then fit as: ```python - opt = tf.optimizers.Adam(learning_rate=0.05) + opt = tf.keras.optimizers.Adam(learning_rate=0.05) loss = tf.function(lambda: -trainable_normal.log_prob(0.5), autograph=True) for _ in range(int(1e3)): opt.minimize(loss, trainable_normal.trainable_variables) @@ -477,7 +477,7 @@ class TransformedVariable(DeferredTensor): g = tape.gradient(negloglik, trainable_normal.trainable_variables) # ==> (-0.5, 0.75) - opt = tf.optimizers.Adam(learning_rate=0.05) + opt = tf.keras.optimizers.Adam(learning_rate=0.05) loss = tf.function(lambda: -trainable_normal.log_prob(0.5)) for _ in range(int(1e3)): opt.minimize(loss, trainable_normal.trainable_variables) diff --git a/tensorflow_probability/python/vi/optimization.py b/tensorflow_probability/python/vi/optimization.py index c06f31cb98..233499c7db 100644 --- a/tensorflow_probability/python/vi/optimization.py +++ b/tensorflow_probability/python/vi/optimization.py @@ -442,8 +442,8 @@ def fit_surrogate_posterior(target_log_prob_fn, transformations of unconstrained variables, so that the transformations execute at runtime instead of at distribution creation. optimizer: Optimizer instance to use. This may be a TF1-style - `tf.train.Optimizer`, TF2-style `tf.optimizers.Optimizer`, or any Python - object that implements `optimizer.apply_gradients(grads_and_vars)`. + `tf.train.Optimizer`, TF2-style `tf.keras.optimizers.Optimizer`, or any + Python object that implements `optimizer.apply_gradients(grads_and_vars)`. num_steps: Python `int` number of steps to run the optimizer. 
convergence_criterion: Optional instance of `tfp.optimizer.convergence_criteria.ConvergenceCriterion` @@ -522,7 +522,7 @@ def log_prob(z, x): losses = tfp.vi.fit_surrogate_posterior( conditioned_log_prob, surrogate_posterior=q_z, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=100) print(q_z.mean(), q_z.stddev()) # => approximately [2.5, 1/sqrt(2)] ``` @@ -535,7 +535,7 @@ def log_prob(z, x): losses = tfp.vi.fit_surrogate_posterior( conditioned_log_prob, surrogate_posterior=q_z, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=100, discrepancy_fn=tfp.vi.kl_forward) ``` @@ -589,7 +589,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z, importance_sample_size=10, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=200) # Estimate posterior statistics with importance sampling. @@ -680,7 +680,7 @@ def variational_model_fn(): losses, log_amplitude_path, sample_path = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=lambda *args: model.log_prob(args), surrogate_posterior=q, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), sample_size=1, num_steps=500, trace_fn=lambda loss, grads, vars: (loss, kernel_log_amplitude, diff --git a/tensorflow_probability/python/vi/optimization_test.py b/tensorflow_probability/python/vi/optimization_test.py index 1b4a872bfe..0db4a217ff 100644 --- a/tensorflow_probability/python/vi/optimization_test.py +++ b/tensorflow_probability/python/vi/optimization_test.py @@ -79,7 +79,7 @@ def trainable_log_prob(z): q, num_steps=1000, sample_size=10, - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.keras.optimizers.Adam(0.1), seed=seed) self.evaluate(tf1.global_variables_initializer()) with tf.control_dependencies([loss_curve]): @@ -112,7 +112,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z, importance_sample_size=10, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=opt_seed) self.evaluate(tf1.global_variables_initializer()) @@ -140,7 +140,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z_again, importance_sample_size=10, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=opt_seed) self.evaluate(tf1.global_variables_initializer()) @@ -172,7 +172,7 @@ def trainable_q_fn(): q, num_steps=1000, sample_size=100, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), seed=seed) self.evaluate(tf1.global_variables_initializer()) loss_curve_ = self.evaluate((loss_curve)) @@ -230,7 +230,7 @@ def variational_model_fn(): losses, sample_path = optimization.fit_surrogate_posterior( target_log_prob_fn=lambda *args: model.log_prob(args), surrogate_posterior=q, - optimizer=tf.optimizers.Adam(learning_rate=0.1), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=test_util.test_seed(), sample_size=1,