diff --git a/Project.toml b/Project.toml
index 881e9a3c8..e02d22bb5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -34,4 +34,4 @@ julia = "1.6"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
-test = ["Test"]
+test = ["Test"]
\ No newline at end of file
diff --git a/docs/.gitignore b/docs/.gitignore
deleted file mode 100644
index ba39cc531..000000000
--- a/docs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-Manifest.toml
diff --git a/docs/Project.toml b/docs/Project.toml
index 01db6a719..6dfe81a83 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,6 +1,7 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Gen = "ea4f424c-a589-11e8-07c0-fd5c91b9da4a"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
[compat]
-Documenter = "0.27"
+Documenter = "1"
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..78ddd3f69
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,24 @@
+# Website Docs
+- See `pages.jl` for the skeleton (page structure) of the website.
+- `make.jl` to build the website index.
+
+The docs are divided in roughly four sections:
+- Getting Started + Tutorials
+- How-to Guides
+- API = Modeling API + Inference API
+- Explanations + Internals
+
+
+# Developing
+To build the docs, run `julia --project make.jl` or alternatively start up the Julia REPL and include `make.jl`. For debugging, consider setting `draft=true` in the `makedocs` function found in `make.jl`.
+Currently you must write the tutorial directly in the docs rather than a source file (e.g. Quarto). See `getting_started` or `tutorials` for examples.
+
+Code snippets must use the triple backtick with a label to run. The environment carries over so long as the labels match. Example:
+
+```@example tutorial_1
+x = rand()
+```
+
+```@example tutorial_1
+print(x)
+```
\ No newline at end of file
diff --git a/docs/build_docs_locally.sh b/docs/build_docs_locally.sh
deleted file mode 100755
index 40d6d9002..000000000
--- a/docs/build_docs_locally.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-# run this script from the Gen/ directory, it will generate HTML documentation under docs/build
-
-julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
-julia --project=docs/ docs/make.jl
diff --git a/docs/make.jl b/docs/make.jl
index 9c641d022..92a7cd065 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,37 +1,20 @@
+# Run: julia --project make.jl
using Documenter, Gen
+include("pages.jl")
makedocs(
- sitename = "Gen",
modules = [Gen],
- pages = [
- "Home" => "index.md",
- "Getting Started" => "getting_started.md",
- "Tutorials" => "tutorials.md",
- "Modeling Languages and APIs" => [
- "Generative Functions" => "ref/gfi.md",
- "Probability Distributions" => "ref/distributions.md",
- "Built-in Modeling Language" => "ref/modeling.md",
- "Generative Function Combinators" => "ref/combinators.md",
- "Choice Maps" => "ref/choice_maps.md",
- "Selections" => "ref/selections.md",
- "Optimizing Trainable Parameters" => "ref/parameter_optimization.md",
- "Trace Translators" => "ref/trace_translators.md",
- "Extending Gen" => "ref/extending.md"
- ],
- "Standard Inference Library" => [
- "Importance Sampling" => "ref/importance.md",
- "MAP Optimization" => "ref/map.md",
- "Markov chain Monte Carlo" => "ref/mcmc.md",
- "MAP Optimization" => "ref/map.md",
- "Particle Filtering" => "ref/pf.md",
- "Variational Inference" => "ref/vi.md",
- "Learning Generative Functions" => "ref/learning.md"
- ],
- "Internals" => [
- "Optimizing Trainable Parameters" => "ref/internals/parameter_optimization.md",
- "Modeling Language Implementation" => "ref/internals/language_implementation.md"
- ]
- ]
+ doctest = false,
+ clean = true,
+ warnonly = true,
+ format = Documenter.HTML(;
+ assets = String["assets/header.js", "assets/header.css", "assets/theme.css"],
+ collapselevel=1,
+ ),
+ sitename = "Gen.jl",
+ pages = pages,
+ checkdocs=:exports,
+ pagesonly=true,
)
deploydocs(
diff --git a/docs/pages.jl b/docs/pages.jl
new file mode 100644
index 000000000..dc340eb7e
--- /dev/null
+++ b/docs/pages.jl
@@ -0,0 +1,53 @@
+pages = [
+ "Home" => "index.md",
+ "Getting Started" => [
+ "Example 1: Linear Regression" => "getting_started/linear_regression.md",
+ ],
+ "Tutorials" => [
+ "Basics" => [
+ "tutorials/basics/modeling_in_gen.md",
+ "tutorials/basics/gfi.md",
+ "tutorials/basics/combinators.md",
+ "tutorials/basics/particle_filter.md",
+ "tutorials/basics/vi.md",
+ ],
+ "Advanced" => [
+ "tutorials/trace_translators.md",
+ ],
+        "Model Optimizations" => [
+ "Speeding Inference with the Static Modeling Language" => "tutorials/model_optimizations/scaling_with_sml.md",
+ ],
+ ],
+ "How-to Guides" => [
+ "MCMC Kernels" => "how_to/mcmc_kernels.md",
+ "Custom Distributions" => "how_to/custom_distributions.md",
+ "Custom Modeling Languages" => "how_to/custom_dsl.md",
+ "Custom Gradients" => "how_to/custom_derivatives.md",
+ "Incremental Computation" => "how_to/custom_incremental_computation.md",
+ ],
+ "API Reference" => [
+ "Modeling Library" => [
+ "Generative Functions" => "api/model/gfi.md",
+ "Probability Distributions" => "api/model/distributions.md",
+ "Choice Maps" => "api/model/choice_maps.md",
+ "Built-in Modeling Languages" => "api/model/modeling.md",
+ "Combinators" => "api/model/combinators.md",
+ "Selections" => "api/model/selections.md",
+ "Optimizing Trainable Parameters" => "api/model/parameter_optimization.md",
+ "Trace Translators" => "api/model/trace_translators.md",
+ ],
+ "Inference Library" => [
+ "Importance Sampling" => "api/inference/importance.md",
+ "MAP Optimization" => "api/inference/map.md",
+ "Markov chain Monte Carlo" => "api/inference/mcmc.md",
+
+ "Particle Filtering" => "api/inference/pf.md",
+ "Variational Inference" => "api/inference/vi.md",
+ "Learning Generative Functions" => "api/inference/learning.md"
+ ],
+ ],
+ "Explanation and Internals" => [
+ "Modeling Language Implementation" => "explanations/language_implementation.md",
+ "explanations/combinator_design.md"
+ ]
+]
diff --git a/docs/src/ref/importance.md b/docs/src/api/inference/importance.md
similarity index 100%
rename from docs/src/ref/importance.md
rename to docs/src/api/inference/importance.md
diff --git a/docs/src/ref/learning.md b/docs/src/api/inference/learning.md
similarity index 99%
rename from docs/src/ref/learning.md
rename to docs/src/api/inference/learning.md
index 2192793fc..daa62eb85 100644
--- a/docs/src/ref/learning.md
+++ b/docs/src/api/inference/learning.md
@@ -209,7 +209,7 @@ Then, the traces of the model can be obtained by simulating from the variational
Instead of fitting the variational approximation from scratch for each observation, it is possible to fit an *inference model* instead, that takes as input the observation, and generates a distribution on latent variables as output (as in the wake sleep algorithm).
When we train the variational approximation by minimizing the evidence lower bound (ELBO) this is called amortized variational inference.
Variational autencoders are an example.
-It is possible to perform amortized variational inference using [`black_box_vi`](@ref) or [`black_box_vimco!`](@ref).
+It is possible to perform amortized variational inference using [`black_box_vi!`](@ref) or [`black_box_vimco!`](@ref).
## References
diff --git a/docs/src/ref/map.md b/docs/src/api/inference/map.md
similarity index 100%
rename from docs/src/ref/map.md
rename to docs/src/api/inference/map.md
diff --git a/docs/src/api/inference/mcmc.md b/docs/src/api/inference/mcmc.md
new file mode 100644
index 000000000..bd6df6d00
--- /dev/null
+++ b/docs/src/api/inference/mcmc.md
@@ -0,0 +1,19 @@
+# Markov chain Monte Carlo (MCMC)
+
+Gen supports standard Markov Chain Monte Carlo algorithms and allows users to write their own custom kernels.
+```@index
+Pages = ["mcmc.md"]
+```
+
+```@docs
+metropolis_hastings
+mh
+mala
+hmc
+elliptical_slice
+@pkern
+@kern
+@rkern
+reversal
+involutive_mcmc
+```
diff --git a/docs/src/ref/pf.md b/docs/src/api/inference/pf.md
similarity index 100%
rename from docs/src/ref/pf.md
rename to docs/src/api/inference/pf.md
diff --git a/docs/src/api/inference/vi.md b/docs/src/api/inference/vi.md
new file mode 100644
index 000000000..4d55fb43c
--- /dev/null
+++ b/docs/src/api/inference/vi.md
@@ -0,0 +1,7 @@
+## Variational inference
+There are two procedures in the inference library for performing black box variational inference.
+Each of these procedures can also train the model using stochastic gradient descent, as in a variational autoencoder.
+```@docs
+black_box_vi!
+black_box_vimco!
+```
diff --git a/docs/src/ref/choice_maps.md b/docs/src/api/model/choice_maps.md
similarity index 87%
rename from docs/src/ref/choice_maps.md
rename to docs/src/api/model/choice_maps.md
index c065b1b32..b5220c2ba 100644
--- a/docs/src/ref/choice_maps.md
+++ b/docs/src/api/model/choice_maps.md
@@ -30,7 +30,13 @@ Choice maps also implement:
- `==`, which tests if two choice maps have the same addresses and values at those addresses.
-## Mutable Choice Maps
+
+```@docs
+DynamicChoiceMap
+EmptyChoiceMap
+StaticChoiceMap
+choicemap
+```
A mutable choice map can be constructed with [`choicemap`](@ref), and then populated:
```julia
@@ -45,8 +51,18 @@ There is also a constructor that takes initial (address, value) pairs:
choices = choicemap((:x, true), ("foo", 1.25), (:y => 1 => :z, -6.3))
```
+
```@docs
-choicemap
set_value!
set_submap!
+Base.merge(::ChoiceMap, ::ChoiceMap)
+Base.merge(::ChoiceMap, ::Vararg{ChoiceMap})
+Base.isempty(::ChoiceMap)
```
+
+```@docs
+Gen.pair
+Gen.unpair
+Gen.ChoiceMapNestedView
+```
+
diff --git a/docs/src/ref/combinators.md b/docs/src/api/model/combinators.md
similarity index 99%
rename from docs/src/ref/combinators.md
rename to docs/src/api/model/combinators.md
index f02819763..229896523 100644
--- a/docs/src/ref/combinators.md
+++ b/docs/src/api/model/combinators.md
@@ -112,7 +112,9 @@ FunctionalCollections.PersistentVector{Any}[true, false, true, false, true]
## Recurse combinator
-TODO: document me
+```@docs
+Recurse
+```
```@raw html
@@ -161,3 +163,4 @@ The resulting trace contains the subtrace from the branch with index `2` - in th
│
└── :z : 13.552870875213735
```
+
diff --git a/docs/src/ref/distributions.md b/docs/src/api/model/distributions.md
similarity index 98%
rename from docs/src/ref/distributions.md
rename to docs/src/api/model/distributions.md
index f4e8ce772..d3828b59f 100644
--- a/docs/src/ref/distributions.md
+++ b/docs/src/api/model/distributions.md
@@ -1,4 +1,11 @@
-# Probability Distributions
+# [Probability Distributions](@id distributions)
+
+```@docs
+random
+logpdf
+has_output_grad
+logpdf_grad
+```
Gen provides a library of built-in probability distributions, and four ways of
defining custom distributions, each of which are explained below:
@@ -39,6 +46,7 @@ piecewise_uniform
poisson
uniform
uniform_discrete
+broadcasted_normal
```
## [Defining New Distributions Inline with the `@dist` DSL](@id dist_dsl)
diff --git a/docs/src/api/model/gfi.md b/docs/src/api/model/gfi.md
new file mode 100644
index 000000000..0385988be
--- /dev/null
+++ b/docs/src/api/model/gfi.md
@@ -0,0 +1,55 @@
+## [Generative Functions](@id gfi_api)
+
+```@docs
+GenerativeFunction
+Trace
+```
+
+The complete set of methods in the generative function interface (GFI) is:
+
+```@docs
+simulate
+generate
+update
+regenerate
+get_args
+get_retval
+get_choices
+get_score
+get_gen_fn
+Base.getindex
+project
+propose
+assess
+has_argument_grads
+has_submap
+accepts_output_grad
+accumulate_param_gradients!
+choice_gradients
+get_params
+```
+
+```@docs
+Diff
+NoChange
+UnknownChange
+SetDiff
+Diffed
+```
+
+```@docs
+CustomUpdateGF
+apply_with_state
+update_with_state
+```
+
+```@docs
+CustomGradientGF
+apply
+gradient
+```
+
+```@docs
+Gen.init_update_state
+Gen.apply_update!
+```
\ No newline at end of file
diff --git a/docs/src/ref/modeling.md b/docs/src/api/model/modeling.md
similarity index 95%
rename from docs/src/ref/modeling.md
rename to docs/src/api/model/modeling.md
index a89367fd7..2e3c319a2 100644
--- a/docs/src/ref/modeling.md
+++ b/docs/src/api/model/modeling.md
@@ -1,4 +1,4 @@
-# Built-in Modeling Language
+# [The Dynamic Modeling Language](@id dynamic_modeling_language)
Gen provides a built-in embedded modeling language for defining generative functions.
The language uses a syntax that extends Julia's syntax for defining regular Julia functions, and is also referred to as the **Dynamic Modeling Language**.
@@ -29,14 +29,14 @@ We can also trace its execution:
```
Optional arguments can be left out of the above operations, and default values will be filled in automatically:
```julia
-julia> (trace, _) = generate(foo, (,));
+julia> (trace, _) = generate(foo, ());
julia> get_args(trace)
(0.1,)
```
-See [Generative Functions](@ref) for the full set of operations supported by a generative function.
+See [Generative Functions](@ref gfi_api) for the full set of operations supported by a generative function.
Note that the built-in modeling language described in this section is only one of many ways of defining a generative function -- generative functions can also be constructed using other embedded languages, or by directly implementing the methods of the generative function interface.
However, the built-in modeling language is intended to being flexible enough cover a wide range of use cases.
-In the remainder of this section, we refer to generative functions defined using the built-in modeling language as `@gen` functions. Details about the implementation of `@gen` functions can be found in the [Modeling Language Implementation](@ref) section.
+In the remainder of this section, we refer to generative functions defined using the built-in modeling language as `@gen` functions. Details about the implementation of `@gen` functions can be found in the [Modeling Language Implementation](@ref language-implementation) section.
## Annotations
@@ -57,7 +57,7 @@ Each argument can have the following different syntactic forms:
Currently, the possible argument annotations are:
-- `grad` (see [Differentiable programming](@ref)).
+- `grad` (see [Differentiable programming](@ref differentiable_modeling)).
**Function annotations.** The `@gen` function itself can also be optionally associated with zero or more annotations, which are separate from the per-argument annotations.
Function-level annotations use the following different syntactic forms:
@@ -70,11 +70,11 @@ Function-level annotations use the following different syntactic forms:
Currently the possible function annotations are:
-- `grad` (see [Differentiable programming](@ref)).
+- `grad` (see [Differentiable programming](@ref differentiable_modeling)).
-- `static` (see [Static Modeling Language](@ref)).
+- `static` (see [Static Modeling Language](@ref sml)).
-- `nojuliacache` (see [Static Modeling Language](@ref)).
+- `nojuliacache` (see [Static Modeling Language](@ref sml)).
## Making random choices
@@ -82,7 +82,7 @@ Random choices are made by calling a probability distribution on some arguments:
```julia
val::Bool = bernoulli(0.5)
```
-See [Probability Distributions](@ref) for the set of built-in probability distributions, and for information on implementing new probability distributions.
+See [Probability Distributions](@ref distributions) for the set of built-in probability distributions, and for information on implementing new probability distributions.
In the body of a `@gen` function, wrapping a call to a random choice with an `@trace` expression associates the random choice with an *address*, and evaluates to the value of the random choice.
The syntax is:
@@ -145,7 +145,7 @@ It is recommended to write disciplined generative functions when possible.
**Untraced call**:
If `foo` is a generative function, we can invoke `foo` from within the body of a `@gen` function using regular call syntax.
-The random choices made within the call are not given addresses in our trace, and are therefore *untraced* random choices (see [Generative Function Interface](@ref) for details on untraced random choices).
+The random choices made within the call are not given addresses in our trace, and are therefore *untraced* random choices (see [Generative Function Interface](@ref gfi_api) for details on untraced random choices).
```julia
val = foo(0.5)
```
@@ -247,10 +247,10 @@ Note that `~` is also defined in `Base` as a unary operator that performs the bi
Like regular Julia functions, `@gen` functions return either the expression used in a `return` keyword, or by evaluating the last expression in the function body.
Note that the return value of a `@gen` function is different from a trace of `@gen` function, which contains the return value associated with an execution as well as the assignment to each random choice made during the execution.
-See [Generative Function Interface](@ref) for more information about traces.
+See [Generative Function Interface](@ref gfi_api) for more information about traces.
-## Trainable parameters
+## [Trainable Parameters](@id trainable_parameters_modeling)
A `@gen` function may begin with an optional block of *trainable parameter declarations*.
The block consists of a sequence of statements, beginning with `@param`, that declare the name and Julia type for each trainable parameter.
@@ -274,14 +274,17 @@ The following methods are exported for the trainable parameters of `@gen` functi
init_param!
get_param
get_param_grad
+set_param_grad!
set_param!
zero_param_grad!
+accumulate_param_gradients_determ!
+gradient_with_state
```
Trainable parameters are designed to be trained using gradient-based methods.
This is discussed in the next section.
-## Differentiable programming
+## [Differentiable Programming](@id differentiable_modeling)
Given a trace of a `@gen` function, Gen supports automatic differentiation of the log probability (density) of all of the random choices made in the trace with respect to the following types of inputs:
@@ -371,7 +374,7 @@ See [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) for more details.
When making a random choice, each argument is either a tracked value or not.
If the argument is a tracked value, then the probability distribution must support differentiation of the log probability (density) with respect to that argument.
Otherwise, an error is thrown.
-The [`has_argument_grads`](@ref) function indicates which arguments support differentiation for a given distribution (see [Probability Distributions](@ref)).
+The [`has_argument_grads`](@ref) function indicates which arguments support differentiation for a given distribution (see [Probability Distributions](@ref distributions)).
If the gradient is required for the *value* of a random choice, the distribution must support differentiation of the log probability (density) with respect to the value.
This is indicated by the [`has_output_grad`](@ref) function.
@@ -381,7 +384,7 @@ It is an error if a tracked value is passed as an argument of a generative funct
If a generative function `gen_fn` has `accepts_output_grad(gen_fn) = true`, then the return value of the generative function call will be tracked and will propagate further through the caller `@gen` function's computation.
-## Static Modeling Language
+## [Static Modeling Language](@id sml)
The *static modeling language* is a restricted variant of the built-in modeling language.
Models written in the static modeling language can result in better inference performance (more inference operations per second and less memory consumption), than the full built-in modeling language, especially for models used with iterative inference algorithms like Markov chain Monte Carlo.
@@ -399,7 +402,7 @@ end
```
After running this code, `foo` is a Julia value whose type is a subtype of `StaticIRGenerativeFunction`, which is a subtype of [`GenerativeFunction`](@ref).
-### Static computation graph
+### Static Computation Graphs
Using the `static` annotation instructs Gen to statically construct a directed acyclic graph for the computation represented by the body of the function.
For the function `foo` above, the static graph looks like:
```@raw html
@@ -431,7 +434,7 @@ First, the definition of a `(static)` generative function is always expected to
Next, in order to be able to construct the static graph, Gen restricts the permitted syntax that can be used in functions annotated with `static`.
In particular, each statement in the body must be one of the following:
-- A `@param` statement specifying any [Trainable parameters](@ref), e.g.:
+- A `@param` statement specifying any [trainable parameters](@ref trainable_parameters_modeling), e.g.:
```julia
@param theta::Float64
diff --git a/docs/src/ref/parameter_optimization.md b/docs/src/api/model/parameter_optimization.md
similarity index 96%
rename from docs/src/ref/parameter_optimization.md
rename to docs/src/api/model/parameter_optimization.md
index 60e05f41d..50f4b323c 100644
--- a/docs/src/ref/parameter_optimization.md
+++ b/docs/src/api/model/parameter_optimization.md
@@ -1,4 +1,4 @@
-# Optimizing Trainable Parameters
+# [Trainable Parameters](@id trainable_parameter_optimization)
Trainable parameters of generative functions are initialized differently depending on the type of generative function.
Trainable parameters of the built-in modeling language are initialized with [`init_param!`](@ref).
@@ -31,3 +31,4 @@ GradientDescent
ADAM
```
For adding new types of update configurations, see [Optimizing Trainable Parameters (Internal)](@ref optimizing-internal).
+
diff --git a/docs/src/ref/selections.md b/docs/src/api/model/selections.md
similarity index 98%
rename from docs/src/ref/selections.md
rename to docs/src/api/model/selections.md
index 152f98b61..86b746bbc 100644
--- a/docs/src/ref/selections.md
+++ b/docs/src/api/model/selections.md
@@ -23,7 +23,6 @@ If we use this selection in the context of a trace of the function `bar` below,
@trace(normal(0, 1), :z)
@trace(normal(0, 1), :w)
end
-end
@gen function bar()
@trace(bernoulli(0.5), :x)
@@ -57,3 +56,6 @@ DynamicSelection
StaticSelection
ComplementSelection
```
+```@docs
+Gen.get_address_schema
+```
diff --git a/docs/src/api/model/trace_translators.md b/docs/src/api/model/trace_translators.md
new file mode 100644
index 000000000..14de30ec9
--- /dev/null
+++ b/docs/src/api/model/trace_translators.md
@@ -0,0 +1,20 @@
+## Trace Translators
+
+```@docs
+@transform
+@read
+@write
+@copy
+@tcall
+pair_bijections!
+is_involution!
+inverse
+TraceTranslator
+DeterministicTraceTranslator
+GeneralTraceTranslator
+SimpleExtendingTraceTranslator
+SymmetricTraceTranslator
+```
+```@docs
+TraceTransformDSLProgram
+```
diff --git a/docs/src/assets/header.css b/docs/src/assets/header.css
new file mode 100644
index 000000000..155b10a3b
--- /dev/null
+++ b/docs/src/assets/header.css
@@ -0,0 +1,138 @@
+
+@media all and (max-width: 560px) {
+ header.navigation {
+ position: fixed !important;
+ left:0;
+ top: 0;
+ width: 100%;
+ }
+
+ header.navigation div.container {
+ margin-left: 0rem;
+ }
+
+ header.navigation div.container nav.navbar {
+ min-height: 1rem !important;
+ }
+
+ header.navigation div.container nav.navbar ul.navbar-nav {
+ min-height: 1rem !important;
+ margin-left: 0.5rem !important;
+ }
+
+ header.navigation div.container nav.navbar ul.navbar-nav li.small-item {
+ visibility: visible !important;
+ display: block !important;
+ margin: 0.5rem;
+ }
+
+ header.navigation div.container nav.navbar ul.navbar-nav li.nav-item {
+ visibility: hidden;
+ display: none;
+ }
+
+ header.navigation div.container nav.navbar ul.navbar-nav li.nav-item a {
+ visibility: hidden;
+ display: none;
+ }
+
+ html:not(.theme--documenter-dark) body #documenter .docs-main {
+ margin-top: 2rem !important;
+ }
+}
+
+@media all and (max-width: 1055px) and (min-width: 561px){
+ header.navigation {
+ position: fixed !important;
+ left:0;
+ top: 0;
+ width: 100%;
+ }
+
+ header.navigation div.container {
+ margin-left: 0rem;
+ }
+
+ header.navigation div.container nav.navbar ul.navbar-nav {
+ width: 80% !important;
+ }
+}
+
+@media all and (min-width: 1056px) {
+ header.navigation {
+ position: fixed !important;
+ left:0;
+ top: 0;
+ width: 100%;
+ }
+
+ header.navigation div.container {
+ margin-left: 18rem;
+ }
+
+}
+
+html.theme--documenter-dark header.navigation {
+ background-color: #1f2424 !important;
+}
+
+html.theme--documenter-dark header.navigation div.container {
+ border-bottom: 1px solid #5e6d6f;
+}
+
+html.theme--documenter-dark header.navigation div.container nav.navbar {
+ background-color: #1f2424 !important;
+}
+
+html.theme--documenter-dark header.navigation div.container nav.navbar ul.navbar-nav li.nav-item a.nav-link {
+ color: white;
+ transition: color 100ms;
+}
+
+html.theme--documenter-dark header.navigation div.container nav.navbar ul.navbar-nav li.nav-item a.nav-link:hover {
+ color: #0aa8a7
+}
+
+html header.navigation {
+ background-color: white !important;
+}
+
+html header.navigation div.container {
+ border-bottom: 1px solid #dbdbdb;
+}
+
+html header.navigation div.container nav.navbar ul.navbar-nav li.nav-item a.nav-link {
+ color: #222;
+ transition: color 100ms;
+}
+
+html header.navigation div.container nav.navbar ul.navbar-nav li.nav-item a.nav-link:hover {
+ color: #0aa8a7
+}
+
+header.navigation {
+ z-index: 3;
+}
+
+header.navigation div.container nav.navbar ul.navbar-nav {
+ margin-left: 4rem;
+ min-height: 3.25rem;
+ width: 70%;
+ display: flex;
+ align-self: auto;
+ flex-direction: row;
+ justify-content: space-around;
+}
+
+header.navigation div.container nav.navbar ul.navbar-nav li.nav-item {
+ align-self: stretch;
+ align-content: space-around;
+ justify-content: center;
+ display: flex;
+ flex-direction: column;
+}
+
+header.navigation div.container nav.navbar ul.navbar-nav li.small-item {
+ visibility: hidden;
+ display: none;
+}
\ No newline at end of file
diff --git a/docs/src/assets/header.js b/docs/src/assets/header.js
new file mode 100644
index 000000000..2f211b385
--- /dev/null
+++ b/docs/src/assets/header.js
@@ -0,0 +1,98 @@
+// Source:
+// https://github.com/ReactiveBayes/RxInfer.jl/blob/246b196a3ea29d0b5744ce241a923c7a3b30eaf4/docs/src/assets/header.js#L4
+
+// We add a simple `onload` hook to inject the custom header for our `HTML`-generated pages
+window.onload = function() {
+ //
+ const header = document.createElement('header')
+ header.classList.add("navigation")
+ header.appendChild((() => {
+ const container = document.createElement('div')
+ container.classList.add('container')
+ container.appendChild((() => {
+ const nav = document.createElement('nav')
+ nav.classList.add("navbar")
+
+ nav.appendChild((() => {
+ const ul = document.createElement("ul")
+ ul.classList.add("navbar-nav")
+
+ ul.appendChild((() => {
+ const smalllink = document.createElement('li')
+ smalllink.classList.add('small-item')
+ smalllink.appendChild((() => {
+ const a = document.createElement('a')
+ a.classList.add("nav-link")
+ a.href = 'https://gen.dev'
+ a.innerHTML = 'Gen.jl'
+ a.title = 'Gen.jl'
+ return a
+ })())
+ return smalllink
+ })())
+
+ const items = [
+ { title: "Home", link: "https://gen.dev" },
+ { title: "Get Started", link: "https://github.com" },
+ { title: "Documentation", link: "https://www.gen.dev/docs/stable/" },
+ { title: "Examples", link: "https://probcomp.github.io/Gen.jl/" },
+ { title: "Papers", link: "http://probcomp.csail.mit.edu" },
+ { title: "Contact", link: "http://localhost:1313/" },
+ { title: "GitHub", link: "https://github.com/probcomp/", icon: ["fab", "fa-github"] },
+ ]
+
+ items.forEach((item) => {
+ ul.appendChild(((item) => {
+ const li = document.createElement("li")
+ li.classList.add("nav-item")
+ li.appendChild((() => {
+ const a = document.createElement("a")
+
+ if (item.icon !== undefined) {
+ a.appendChild((() => {
+ const i = document.createElement("i")
+ i.classList.add(...(item.icon))
+ return i
+ })())
+ }
+
+ a.classList.add("nav-link")
+ a.href = item.link
+ a.title = item.title
+
+ a.appendChild((() => {
+ const span = document.createElement("span")
+ span.innerHTML = ` ${item.title}`
+ return span
+ })())
+
+ return a
+ })())
+ return li
+ })(item))
+ })
+ return ul
+ })())
+ return nav
+ })())
+ return container
+ })())
+
+ const documenterTarget = document.querySelector('#documenter');
+
+ documenterTarget.parentNode.insertBefore(header, documenterTarget);
+
+ // Edit broken links in the examples, see issue #70
+ const editOnGithubLinkTarget = document.querySelector('.docs-edit-link');
+
+ if (editOnGithubLinkTarget) {
+ const link = editOnGithubLinkTarget.href;
+ if (link.includes('docs/src/examples')) {
+ const fixedLink = link.replace('docs/src/', '').replace('.md', '.ipynb');
+ editOnGithubLinkTarget.href = fixedLink;
+ console.log('Fixed link for the example: ', fixedLink)
+ }
+ }
+
+}
+
diff --git a/docs/src/assets/theme.css b/docs/src/assets/theme.css
new file mode 100644
index 000000000..67017e105
--- /dev/null
+++ b/docs/src/assets/theme.css
@@ -0,0 +1,27 @@
+html body #documenter .docs-main {
+ margin-top: 4rem;
+}
+
+html body #documenter .docs-sidebar.visible {
+ margin-top: 0px !important;
+}
+
+html div.light-biglogo {
+ display: block;
+}
+
+html.theme--documenter-dark div.light-biglogo {
+ display: none;
+}
+
+html div.dark-biglogo {
+ display: none;
+}
+
+html.theme--documenter-dark div.dark-biglogo {
+ display: block;
+}
+
+#documenter .docs-main article.content img {
+ width: 100%;
+}
\ No newline at end of file
diff --git a/docs/src/explanations/combinator_design.md b/docs/src/explanations/combinator_design.md
new file mode 100644
index 000000000..2d999e330
--- /dev/null
+++ b/docs/src/explanations/combinator_design.md
@@ -0,0 +1,9 @@
+# Combinator Design and Implementation
+
+Internally, the Combinators use custom trace types.
+```@docs
+Gen.VectorTrace
+Gen.process_all_new!
+Gen.update_recurse_merge
+Gen.update_discard
+```
\ No newline at end of file
diff --git a/docs/src/ref/internals/language_implementation.md b/docs/src/explanations/language_implementation.md
similarity index 66%
rename from docs/src/ref/internals/language_implementation.md
rename to docs/src/explanations/language_implementation.md
index 03f45cc9e..a5434196c 100644
--- a/docs/src/ref/internals/language_implementation.md
+++ b/docs/src/explanations/language_implementation.md
@@ -1,12 +1,14 @@
# [Modeling Language Implementation](@id language-implementation)
+!!! warning
+ The API described here is internal to Gen's design and is subject to changes with no deprecation.
## Parsing `@gen` functions
-Gen's built-in modeling languages are designed to preserve Julia's syntax as far as possible, apart from the [Tilde syntax](@ref) for calling generative functions, and the restrictions imposed on the [Static Modeling Language](@ref). In order to preserve that syntax, including the use of non-Gen macros within `@gen` functions, we relegate as much of the parsing of `@gen` functions as possible to Julia's macro-expander and parser.
+Gen's built-in modeling languages are designed to preserve Julia's syntax as far as possible, apart from the [Tilde syntax](@ref) for calling generative functions, and the restrictions imposed on the [Static Modeling Language](@ref sml). In order to preserve that syntax, including the use of non-Gen macros within `@gen` functions, we relegate as much of the parsing of `@gen` functions as possible to Julia's macro-expander and parser.
In particular, we adopt an implementation strategy that enforces a separation between the surface syntax associated with Gen-specific macros (i.e., `@trace` and `@param`) and their corresponding implementations, which differ across the Dynamic Modeling Language (DML) and the Static Modeling Language (SML). We do this by introducing the custom expressions `Expr(:gentrace, call, addr)` and `Expr(:genparam, name, type)`, which serve as intermediate representations in the macro-expanded [abstract syntax tree](https://docs.julialang.org/en/v1/manual/metaprogramming/#Program-representation-1).
-Each modeling language can then handle these custom expressions in their own manner, either by parsing them to nodes in the [Static Computation Graph](@ref) (for the SML), or by substituting them with their implementations (for the DML). This effectively allows the SML and DML to have separate implementations of `@trace` and `@param`.
+Each modeling language can then handle these custom expressions in their own manner, either by parsing them to nodes in the [Static Computation Graphs](@ref) (for the SML), or by substituting them with their implementations (for the DML). This effectively allows the SML and DML to have separate implementations of `@trace` and `@param`.
For clarity, below is a procedural description of how the `@gen` macro processes Julia function syntax:
@@ -15,3 +17,32 @@ For clarity, below is a procedural description of how the `@gen` macro processes
3. Pass the macro-expanded and de-sugared function body on to `make_static_gen_function` or `make_dynamic_gen_function` accordingly.
4. For static `@gen` functions, match `:gentrace` expressions when adding address nodes to the static computation graph, and match `:genparam` expressions when adding parameter nodes to the static computation graph. A `StaticIRGenerativeFunction` is then compiled from the static computation graph.
5. For dynamic `@gen` functions, rewrite any `:gentrace` expression with its implementation `dynamic_trace_impl`, and rewrite any `:genparam` expression with its implementation `dynamic_param_impl`. The rewritten syntax tree is then evaluated as a standard Julia function, which serves as the implementation of the constructed `DynamicDSLFunction`.
+
+```@docs
+Gen.make_dynamic_gen_function
+Gen.dynamic_param_impl
+Gen.rewrite_dynamic_gen_exprs
+Gen.dynamic_trace_impl
+Gen.arg_to_ast
+Gen.escape_default
+Gen.state
+Gen.choice_or_call_at
+```
+## Static Modeling Language Tooling
+
+```@docs
+Gen.StaticIRGenerativeFunction
+Gen.make_static_gen_function
+Gen.gen_node_name
+Gen.parse_static_dsl_line!
+Gen.parse_typed_var
+Gen.parse_julia_expr!
+Gen.parse_param_line!
+Gen.parse_trace_expr!
+Gen.parse_and_rewrite_trace!
+Gen.parse_assignment_line!
+Gen.parse_return_line!
+Gen.parse_static_dsl_function_body!
+Gen.split_addr!
+Gen.resolve_symbols
+```
diff --git a/docs/src/getting_started.md b/docs/src/getting_started/linear_regression.md
similarity index 56%
rename from docs/src/getting_started.md
rename to docs/src/getting_started/linear_regression.md
index 01680544b..73db75845 100644
--- a/docs/src/getting_started.md
+++ b/docs/src/getting_started/linear_regression.md
@@ -1,34 +1,29 @@
-# Getting Started
+# Getting Started with Gen.jl: Linear Regression
-## Installation
-First, obtain Julia 1.3 or later, available [here](https://julialang.org/downloads/).
+Let's write a short Gen program that does Bayesian linear regression - that is, given a set of observation points in the xy-plane, we want to find a line that fits them "well". What does "well" mean in Bayesian linear regression? Well, one way of interpreting this is by proposing a line that makes the data highly likely - as in a high probability of occurring.
-The Gen package can be installed with the Julia package manager. From the Julia REPL, type `]` to enter the Pkg REPL mode and then run:
-```
-pkg> add Gen
-```
-To test the installation locally, you can run the tests with:
-```julia
-using Pkg; Pkg.test("Gen")
-```
+First, we need to define a _generative model_ that describes how we believe the points were generated.
-## Example
-
-Let's write a short Gen program that does Bayesian linear regression: given a set of points in the (x, y) plane, we want to find a line that fits them well.
+```math
+\mu \sim N(0,2)\\
+b \sim N(0,10)\\
+\epsilon_i \sim N(0,1)\\
+y_i | x_i \sim \mu x_i + b + \epsilon_i
+```
-There are three main components to a typical Gen program.
+This model first randomly samples a slope ``\mu`` and an intercept ``b`` from normal distributions to define the line ``y=\mu x+b``. Next, each x-coordinate is evaluated and perturbed with a little noise. Now let's write this as a probabilistic program.
-First, we define a _generative model_: a Julia function, extended with some extra syntax, that, conceptually, simulates a fake dataset. The model below samples `slope` and `intercept` parameters, and then for each of the x-coordinates that it accepts as input, samples a corresponding y-coordinate. We name the random choices we make with `@trace`, so we can refer to them in our inference program.
+The description of the line is a mathematical one, but we can write it using normal code constructs. The _generative model_ is a Julia function with a _tilde_ (~) operator for sampling. Observe that the function below looks almost the same as the mathematical model above.
-```julia
+```@example linear_regression
using Gen
@gen function my_model(xs::Vector{Float64})
- slope = @trace(normal(0, 2), :slope)
- intercept = @trace(normal(0, 10), :intercept)
+ slope ~ normal(0, 2)
+ intercept ~ normal(0, 10)
for (i, x) in enumerate(xs)
- @trace(normal(slope * x + intercept, 1), "y-$i")
+ {"y-$i"} ~ normal(slope * x + intercept, 1)
end
end
```
@@ -38,7 +33,7 @@ Inference programs are regular Julia code, and make use of Gen's standard infere
The inference program below takes in a data set, and runs an iterative MCMC algorithm to fit `slope` and `intercept` parameters:
-```julia
+```@example linear_regression
function my_inference_program(xs::Vector{Float64}, ys::Vector{Float64}, num_iters::Int)
# Create a set of constraints fixing the
# y coordinates to the observed y values
@@ -53,7 +48,7 @@ function my_inference_program(xs::Vector{Float64}, ys::Vector{Float64}, num_iter
# Iteratively update the slope then the intercept,
# using Gen's metropolis_hastings operator.
- for iter=1:num_iters
+ for _=1:num_iters
(trace, _) = metropolis_hastings(trace, select(:slope))
(trace, _) = metropolis_hastings(trace, select(:intercept))
end
@@ -67,7 +62,7 @@ end
Finally, we run the inference program on some data, and get the results:
-```julia
+```@example linear_regression
xs = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]
ys = [8.23, 5.87, 3.99, 2.59, 0.23, -0.66, -3.53, -6.91, -7.24, -9.90]
(slope, intercept) = my_inference_program(xs, ys, 1000)
diff --git a/docs/src/how_to/custom_derivatives.md b/docs/src/how_to/custom_derivatives.md
new file mode 100644
index 000000000..e52f92e9c
--- /dev/null
+++ b/docs/src/how_to/custom_derivatives.md
@@ -0,0 +1,25 @@
+# How to Write Custom Gradients
+
+To add a custom gradient for a differentiable deterministic computation, define a concrete subtype of [`CustomGradientGF`](@ref) with the following methods:
+
+- [`apply`](@ref)
+
+- [`gradient`](@ref)
+
+- [`has_argument_grads`](@ref)
+
+For example:
+
+```julia
+struct MyPlus <: CustomGradientGF{Float64} end
+
+Gen.apply(::MyPlus, args) = args[1] + args[2]
+Gen.gradient(::MyPlus, args, retval, retgrad) = (retgrad, retgrad)
+Gen.has_argument_grads(::MyPlus) = (true, true)
+```
+
+## [Optimizing Trainable Parameters](@id optimizing-internal)
+
+To add support for a new type of gradient-based parameter update, create a new type with the following methods defined for the types of generative functions that are to be supported.
+- [`Gen.init_update_state`](@ref)
+- [`Gen.apply_update!`](@ref)
diff --git a/docs/src/ref/extending.md b/docs/src/how_to/custom_distributions.md
similarity index 52%
rename from docs/src/ref/extending.md
rename to docs/src/how_to/custom_distributions.md
index 7f9dfd480..4722bfa29 100644
--- a/docs/src/ref/extending.md
+++ b/docs/src/how_to/custom_distributions.md
@@ -1,102 +1,4 @@
-# Extending Gen
-
-Gen is designed for extensibility.
-To implement behaviors that are not directly supported by the existing modeling languages, users can implement `black-box' generative functions directly, without using built-in modeling language.
-These generative functions can then be invoked by generative functions defined using the built-in modeling language.
-This includes several special cases:
-
-- Extending Gen with custom gradient computations
-
-- Extending Gen with custom incremental computation of return values
-
-- Extending Gen with new modeling languages.
-
-## Custom gradients
-
-To add a custom gradient for a differentiable deterministic computation, define a concrete subtype of [`CustomGradientGF`](@ref) with the following methods:
-
-- [`apply`](@ref)
-
-- [`gradient`](@ref)
-
-- [`has_argument_grads`](@ref)
-
-For example:
-
-```julia
-struct MyPlus <: CustomGradientGF{Float64} end
-
-Gen.apply(::MyPlus, args) = args[1] + args[2]
-Gen.gradient(::MyPlus, args, retval, retgrad) = (retgrad, retgrad)
-Gen.has_argument_grads(::MyPlus) = (true, true)
-```
-
-```@docs
-CustomGradientGF
-apply
-gradient
-```
-
-## Custom incremental computation
-
-Iterative inference techniques like Markov chain Monte Carlo involve repeatedly updating the execution traces of generative models.
-In some cases, the output of a deterministic computation within the model can be incrementally computed during each of these updates, instead of being computed from scratch.
-
-To add a custom incremental computation for a deterministic computation, define a concrete subtype of [`CustomUpdateGF`](@ref) with the following methods:
-
-- [`apply_with_state`](@ref)
-
-- [`update_with_state`](@ref)
-
-- [`has_argument_grads`](@ref)
-
-The second type parameter of `CustomUpdateGF` is the type of the state that may be used internally to facilitate incremental computation within `update_with_state`.
-
-For example, we can implement a function for computing the sum of a vector that efficiently computes the new sum when a small fraction of the vector elements change:
-
-```julia
-struct MyState
- prev_arr::Vector{Float64}
- sum::Float64
-end
-
-struct MySum <: CustomUpdateGF{Float64,MyState} end
-
-function Gen.apply_with_state(::MySum, args)
- arr = args[1]
- s = sum(arr)
- state = MyState(arr, s)
- (s, state)
-end
-
-function Gen.update_with_state(::MySum, state, args, argdiffs::Tuple{VectorDiff})
- arr = args[1]
- prev_sum = state.sum
- retval = prev_sum
- for i in keys(argdiffs[1].updated)
- retval += (arr[i] - state.prev_arr[i])
- end
- prev_length = length(state.prev_arr)
- new_length = length(arr)
- for i=prev_length+1:new_length
- retval += arr[i]
- end
- for i=new_length+1:prev_length
- retval -= arr[i]
- end
- state = MyState(arr, retval)
- (state, retval, UnknownChange())
-end
-
-Gen.num_args(::MySum) = 1
-```
-
-```@docs
-CustomUpdateGF
-apply_with_state
-update_with_state
-```
-
+# How to Write Custom Gen Distributions
## [Custom distributions](@id custom_distributions)
@@ -113,7 +15,7 @@ Probability distributions are singleton types whose supertype is `Distribution{T
abstract type Distribution{T} end
```
-A new Distribution type must implement the following methods:
+A new `Distribution` type must implement the following methods:
- [`random`](@ref)
@@ -139,14 +41,7 @@ Distribution values should also be callable, which is a syntactic sugar with the
bernoulli(0.5) # identical to random(bernoulli, 0.5) and random(Bernoulli(), 0.5)
```
-```@docs
-random
-logpdf
-has_output_grad
-logpdf_grad
-```
-
-## Custom generative functions
+## [Custom Generative Functions](@id custom_generative_functions)
We recommend the following steps for implementing a new type of generative function, and also looking at the implementation for the [`DynamicDSLFunction`](@ref) type as an example.
@@ -229,7 +124,3 @@ If your generative function has trainable parameters, then implement:
- [`accumulate_param_gradients!`](@ref)
-## Custom modeling languages
-
-Gen can be extended with new modeling languages by implementing new generative function types, and constructors for these types that take models as input.
-This typically requires implementing the entire generative function interface, and is advanced usage of Gen.
diff --git a/docs/src/how_to/custom_dsl.md b/docs/src/how_to/custom_dsl.md
new file mode 100644
index 000000000..7e503041b
--- /dev/null
+++ b/docs/src/how_to/custom_dsl.md
@@ -0,0 +1,16 @@
+# How to Write a Custom Modeling Language
+
+Gen can be extended with new modeling languages by implementing new generative function types, and constructors for these types that take models as input.
+This typically requires implementing the entire generative function interface, and is advanced usage of Gen.
+
+
+Gen is designed for extensibility.
+To implement behaviors that are not directly supported by the existing modeling languages, users can implement "black-box" generative functions directly, without using the built-in modeling language.
+These generative functions can then be invoked by generative functions defined using the built-in modeling language.
+This includes several special cases:
+
+- Extending Gen with custom gradient computations
+
+- Extending Gen with custom incremental computation of return values
+
+- Extending Gen with new modeling languages.
\ No newline at end of file
diff --git a/docs/src/how_to/custom_incremental_computation.md b/docs/src/how_to/custom_incremental_computation.md
new file mode 100644
index 000000000..b44cb91bd
--- /dev/null
+++ b/docs/src/how_to/custom_incremental_computation.md
@@ -0,0 +1,53 @@
+# How to Customize Incremental Computation
+
+Iterative inference techniques like Markov chain Monte Carlo involve repeatedly updating the execution traces of generative models.
+In some cases, the output of a deterministic computation within the model can be incrementally computed during each of these updates, instead of being computed from scratch.
+
+To add a custom incremental computation for a deterministic computation, define a concrete subtype of [`CustomUpdateGF`](@ref) with the following methods:
+
+- [`apply_with_state`](@ref)
+
+- [`update_with_state`](@ref)
+
+- [`has_argument_grads`](@ref)
+
+The second type parameter of `CustomUpdateGF` is the type of the state that may be used internally to facilitate incremental computation within `update_with_state`.
+
+For example, we can implement a function for computing the sum of a vector that efficiently computes the new sum when a small fraction of the vector elements change:
+
+```julia
+struct MyState
+ prev_arr::Vector{Float64}
+ sum::Float64
+end
+
+struct MySum <: CustomUpdateGF{Float64,MyState} end
+
+function Gen.apply_with_state(::MySum, args)
+ arr = args[1]
+ s = sum(arr)
+ state = MyState(arr, s)
+ (s, state)
+end
+
+function Gen.update_with_state(::MySum, state, args, argdiffs::Tuple{VectorDiff})
+ arr = args[1]
+ prev_sum = state.sum
+ retval = prev_sum
+ for i in keys(argdiffs[1].updated)
+ retval += (arr[i] - state.prev_arr[i])
+ end
+ prev_length = length(state.prev_arr)
+ new_length = length(arr)
+ for i=prev_length+1:new_length
+ retval += arr[i]
+ end
+ for i=new_length+1:prev_length
+ retval -= arr[i]
+ end
+ state = MyState(arr, retval)
+ (state, retval, UnknownChange())
+end
+
+Gen.num_args(::MySum) = 1
+```
\ No newline at end of file
diff --git a/docs/src/how_to/how_to.md b/docs/src/how_to/how_to.md
new file mode 100644
index 000000000..f2f389b42
--- /dev/null
+++ b/docs/src/how_to/how_to.md
@@ -0,0 +1,3 @@
+# "How-to" guides
+
+Task-oriented guides that answer "How do I do X?" questions for Gen.
\ No newline at end of file
diff --git a/docs/src/ref/mcmc.md b/docs/src/how_to/mcmc_kernels.md
similarity index 93%
rename from docs/src/ref/mcmc.md
rename to docs/src/how_to/mcmc_kernels.md
index 65cb8b426..12195d28d 100644
--- a/docs/src/ref/mcmc.md
+++ b/docs/src/how_to/mcmc_kernels.md
@@ -1,16 +1,13 @@
-# Markov chain Monte Carlo (MCMC)
+# How to Write Markov Chain Monte Carlo Kernels
-Markov chain Monte Carlo (MCMC) is an approach to inference which involves initializing a hypothesis and then repeatedly sampling a new hypotheses given the previous hypothesis by making a change to the previous hypothesis.
-The function that samples the new hypothesis given the previous hypothesis is called the **MCMC kernel** (or `kernel' for short).
-If we design the kernel appropriately, then the distribution of the hypotheses will converge to the conditional (i.e. posterior) distribution as we increase the number of times we apply the kernel.
+Markov chain Monte Carlo (MCMC) is an approach to inference which involves initializing a hypothesis and then repeatedly sampling a new hypothesis given the previous hypothesis by making a change to the previous hypothesis.[^1]
-Gen includes primitives for constructing MCMC kernels and composing them into MCMC algorithms.
-Although Gen encourages you to write MCMC algorithms that converge to the conditional distribution, Gen does not enforce this requirement.
-You may use Gen's MCMC primitives in other ways, including for stochastic optimization.
+[^1]: For background on MCMC, see Andrieu, Christophe, et al. "An introduction to MCMC for machine learning." Machine learning 50.1-2 (2003): 5-43. [Link](https://www.cs.ubc.ca/~arnaud/andrieu_defreitas_doucet_jordan_intromontecarlomachinelearning.pdf).
-For background on MCMC see [1].
+The function that samples the new hypothesis given the previous hypothesis is called the **MCMC kernel** (or "kernel" for short). If we design the kernel appropriately, then the distribution of the hypotheses will converge to the conditional (i.e. posterior) distribution as we increase the number of times we apply the kernel.
-[1] Andrieu, Christophe, et al. "An introduction to MCMC for machine learning." Machine learning 50.1-2 (2003): 5-43. [Link](https://www.cs.ubc.ca/~arnaud/andrieu_defreitas_doucet_jordan_intromontecarlomachinelearning.pdf).
+Gen includes primitives for constructing MCMC kernels and composing them into MCMC algorithms.
+Although Gen encourages you to write MCMC algorithms that converge to the conditional distribution, Gen does not enforce this requirement. You may use Gen's MCMC primitives in other ways, including for stochastic optimization.
## MCMC in Gen
Suppose we are doing inference in the following toy model:
@@ -162,14 +159,14 @@ The range of the for loop may be a deterministic function of the trace (as in `t
The range must be *invariant* under all possible executions of the body of the for loop.
For example, the random walk based kernel embedded in the for loop in our example above cannot modify the value of the random choice `:n` in the trace.
-**If-end expressions**
+**If-end expressions.**
The predicate condition may be a deterministic function of the trace, but it also must be invariant (i.e. remain true) under all possible executions of the body.
**Deterministic let expressions.**
We can use `let x = value .. end` to bind values to a variable, but the expression on the right-hand-side must be deterministic function of its free variables, its value must be invariant under all possible executions of the body.
**Stochastic let expressions.**
-We can use `let x ~ dist(args...) .. end` to sample a stochastic value and bind to a variable, but the expression on the right-hand-side must be the application of a Gen [`Distribution`](@ref) to arguments, and the distribution and its arguments must be invariant under all possible executions of the body.
+We can use `let x ~ dist(args...) .. end` to sample a stochastic value and bind to a variable, but the expression on the right-hand-side must be the application of a Gen [`Distribution`](@ref distributions) to arguments, and the distribution and its arguments must be invariant under all possible executions of the body.
### Declaring primitive kernels for use in composite kernels
@@ -203,15 +200,15 @@ Indeed, they are just regular Julia functions, but with some extra information a
## Involutive MCMC
-Gen's most flexible variant of [`metropolis_hastings`](@ref), called **Involutive MCMC**, allows users to specify any MCMC kernel in the reversible jump MCMC (RJMCMC) framework [2].
+Gen's most flexible variant of [`metropolis_hastings`](@ref), called **Involutive MCMC**, allows users to specify any MCMC kernel in the reversible jump MCMC (RJMCMC) framework. [^2]
Involution MCMC allows you to express a broad class of custom MCMC kernels that are not expressible using the other, simpler variants of Metropolis-Hastings supported by Gen.
These kernels are particularly useful for inferring the structure (e.g. control flow) of a model.
-[2] Green, Peter J. "Reversible jump Markov chain Monte Carlo computation and Bayesian model determination." Biometrika 82.4 (1995): 711-732. [Link](https://academic.oup.com/biomet/article-abstract/82/4/711/252058)
+[^2]: Green, Peter J. "Reversible jump Markov chain Monte Carlo computation and Bayesian model determination." Biometrika 82.4 (1995): 711-732. [Link](https://academic.oup.com/biomet/article-abstract/82/4/711/252058)
An involutive MCMC kernel in Gen takes as input a previous trace of the model (whose choice map we will denote by ``t``), and performs three phases to obtain a new trace of the model:
-- First, it traces the execution of a **proposal**, which is an auxiliary generative function that takes the previous trace of the model as its first argument. Mathematically, we will denote the choice map associated with the trace of the proposal by ``u``. The proposal can of course be defined using the [Built-In Modeling Languages](@ref), just like the model itself. However, unlike many other uses of proposals in Gen, these proposals *can make random choices at addresses that the model does not*.
+- First, it traces the execution of a **proposal**, which is an auxiliary generative function that takes the previous trace of the model as its first argument. Mathematically, we will denote the choice map associated with the trace of the proposal by ``u``. The proposal can of course be defined using the [built-in modeling language](@ref dynamic_modeling_language), just like the model itself. However, unlike many other uses of proposals in Gen, these proposals *can make random choices at addresses that the model does not*.
- Next, it takes the tuple ``(t, u)`` and passes it into an **involution** (denoted mathematically by ``h``), which is a function that returns a new tuple ``(t', u')``, where ``t'`` is the choice map for a new proposed trace of the model, and ``u'`` are random choices for a new trace of the proposal. The defining property of the involution is that *it is invertible*, and *it is its own inverse*; i.e. ``(t, u) = h(h(t, u))``. Intuitively, ``u'`` is a description of a way that the proposal could be reversed, taking ``t'`` to ``t``.
@@ -366,19 +363,4 @@ For custom primitive kernels declared with [`@pkern`](@ref), users can declare t
```
This also assigns `k1` as the reversal of `k2`.
The composite kernel DSL automatically generates the reversal kernel for composite kernels, and built-in stationary kernels like [`mh`](@ref).
-The reversal of a kernel (primitive or composite) can be obtained with [`reversal`](@ref).
-
-
-## API
-```@docs
-metropolis_hastings
-mh
-mala
-hmc
-elliptical_slice
-@pkern
-@kern
-@rkern
-reversal
-involutive_mcmc
-```
+The reversal of a kernel (primitive or composite) can be obtained with [`reversal`](@ref).
\ No newline at end of file
diff --git a/docs/src/images/map_combinator.png b/docs/src/images/map_combinator.png
deleted file mode 100644
index 53716d579..000000000
Binary files a/docs/src/images/map_combinator.png and /dev/null differ
diff --git a/docs/src/images/recurse_combinator.png b/docs/src/images/recurse_combinator.png
deleted file mode 100644
index 3159ab5aa..000000000
Binary files a/docs/src/images/recurse_combinator.png and /dev/null differ
diff --git a/docs/src/images/static_graph.png b/docs/src/images/static_graph.png
deleted file mode 100644
index 8fcdb1ea6..000000000
Binary files a/docs/src/images/static_graph.png and /dev/null differ
diff --git a/docs/src/images/switch_combinator.png b/docs/src/images/switch_combinator.png
deleted file mode 100644
index 98c4e68ea..000000000
Binary files a/docs/src/images/switch_combinator.png and /dev/null differ
diff --git a/docs/src/images/unfold_combinator.png b/docs/src/images/unfold_combinator.png
deleted file mode 100644
index 8fa1b6945..000000000
Binary files a/docs/src/images/unfold_combinator.png and /dev/null differ
diff --git a/docs/src/index.md b/docs/src/index.md
index 161192b99..cc741f22f 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -2,27 +2,63 @@
*A general-purpose probabilistic programming system with programmable inference, embedded in Julia*
-```@contents
-Pages = [
- "getting_started.md",
- "tutorials.md",
- "guide.md",
-]
-Depth = 2
+- What does Gen provide.
+!!! note
+    `Gen.jl` is still under active development. If you find a bug or wish to share ideas for improvement, feel free to visit the GitHub site or contact us here.
+
+## Installation
+
+The Gen package can be installed with the Julia package manager. From the Julia REPL, type `]` to enter the Pkg REPL mode and then run:
+```
+pkg> add Gen
+```
+!!! note
+ Alternatively,
+
+ ```julia
+ julia> import Pkg; Pkg.add("Gen")
+ ```
+
+To test the installation locally, you can run the tests with:
+```julia
+using Pkg; Pkg.test("Gen")
```
-Reference
+## Getting Started
+!!! warning "Tutorial Redirect"
+ We are in the process of moving tutorial docs around. For more stable tutorial docs, see [these tutorials](https://www.gen.dev/tutorials/) instead. See more examples [here](https://github.com/probcomp/GenExamples.jl).
+
+To see an overview of the package, check out the [examples](getting_started/linear_regression.md). For a deep dive on how to do inference with Gen.jl, check out the [tutorials](@ref introduction_to_modeling_in_gen).
+
```@contents
Pages = [
- "ref/modeling.md",
- "ref/combinators.md",
- "ref/assignments.md",
- "ref/selections.md",
- "ref/parameter_optimization.md",
- "ref/inference.md",
- "ref/gfi.md",
- "ref/distributions.md"
- "ref/extending.md",
-]
+ "getting_started/linear_regression.md",
+ ]
Depth = 2
```
+
+## Contributing
+See the [Developer's Guide](https://gen.dev) on how to contribute to the Gen ecosystem.
+
+## Supporting and Citing
+This repo is part of ongoing research at [ProbComp](http://probcomp.csail.mit.edu) and may later include new experimental features! If you use Gen for your work, please consider citing us:
+
+```bibtex
+@inproceedings{Cusumano-Towner:2019:GGP:3314221.3314642,
+ author = {Cusumano-Towner, Marco F. and Saad, Feras A. and Lew, Alexander K. and Mansinghka, Vikash K.},
+ title = {Gen: A General-purpose Probabilistic Programming System with Programmable Inference},
+ booktitle = {Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation},
+ series = {PLDI 2019},
+ year = {2019},
+ isbn = {978-1-4503-6712-7},
+ location = {Phoenix, AZ, USA},
+ pages = {221--236},
+ numpages = {16},
+ url = {http://doi.acm.org/10.1145/3314221.3314642},
+ doi = {10.1145/3314221.3314642},
+ acmid = {3314642},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {Markov chain Monte Carlo, Probabilistic programming, sequential Monte Carlo, variational inference},
+}
+```
\ No newline at end of file
diff --git a/docs/src/ref/images/rjmcmc.png b/docs/src/ref/images/rjmcmc.png
deleted file mode 100644
index ce1cdb8d1..000000000
Binary files a/docs/src/ref/images/rjmcmc.png and /dev/null differ
diff --git a/docs/src/ref/internals/parameter_optimization.md b/docs/src/ref/internals/parameter_optimization.md
deleted file mode 100644
index 48d88d494..000000000
--- a/docs/src/ref/internals/parameter_optimization.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# [Optimizing Trainable Parameters](@id optimizing-internal)
-
-To add support for a new type of gradient-based parameter update, create a new type with the following methods defined for the types of generative functions that are to be supported.
-```@docs
-Gen.init_update_state
-Gen.apply_update!
-```
diff --git a/docs/src/tutorials.md b/docs/src/tutorials.md
deleted file mode 100644
index 5e53c8b3e..000000000
--- a/docs/src/tutorials.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Tutorials and Case Studies
-
-See [Gen Quickstart](https://github.com/probcomp/gen-quickstart) repository for tutorials and case studies
-
-Additional examples are available in the [GenExamples.jl](https://github.com/probcomp/GenExamples.jl) repository.
diff --git a/docs/src/tutorials/basics/combinators.md b/docs/src/tutorials/basics/combinators.md
new file mode 100644
index 000000000..ed7542c82
--- /dev/null
+++ b/docs/src/tutorials/basics/combinators.md
@@ -0,0 +1,114 @@
+# [Generative Combinators](@id combinators_tutorial)
+
+Generative function combinators are Julia functions that take one or more generative functions as input and return a new generative function. Generative function combinators are used to express patterns of repeated computation that appear frequently in generative models. Some generative function combinators are similar to higher order functions from functional programming languages.
+
+## Map combinator
+
+In the schematic below, the kernel is denoted ``\mathcal{G}_{\mathrm{k}}``.
+```@raw html
+
+

+
+```
+
+For example, consider the following generative function, which makes one random choice at address `:r`:
+```@example map_combinator
+using Gen
+@gen function foo(x, y, z)
+ r ~ normal(x^2 + y^2 + z^2, 1.0)
+ return r
+end
+```
+We apply the map combinator to produce a new generative function `bar`:
+```@example map_combinator
+bar = Map(foo)
+```
+We can then obtain a trace of `bar`:
+```@example map_combinator
+trace, _ = generate(bar, ([0.0, 0.5], [0.5, 1.0], [1.0, -1.0]))
+trace
+```
+This causes `foo` to be invoked twice, once with arguments `(0.0, 0.5, 1.0)` in address namespace `1` and once with arguments `(0.5, 1.0, -1.0)` in address namespace `2`.
+
+```@example map_combinator
+get_choices(trace)
+```
+Given the random choices in the resulting trace shown above,
+the return value is:
+
+```@example map_combinator
+get_retval(trace)
+```
+
+## Unfold combinator
+
+In the schematic below, the kernel is denoted ``\mathcal{G}_{\mathrm{k}}``.
+The initial state is denoted ``y_0``, the number of applications is ``n``, and the remaining arguments to the kernel not including the state, are ``z``.
+```@raw html
+
+

+
+```
+
+For example, consider the following kernel, with state type `Bool`, which makes one random choice at address `:y`:
+```@example unfold_combinator
+using Gen
+@gen function foo(t::Int, y_prev::Bool, z1::Float64, z2::Float64)
+ y = @trace(bernoulli(y_prev ? z1 : z2), :y)
+ return y
+end
+```
+We apply the unfold combinator to produce a new generative function `bar`:
+```@example unfold_combinator
+bar = Unfold(foo)
+```
+We can then obtain a trace of `bar`:
+```@example unfold_combinator
+trace, _ = generate(bar, (5, false, 0.05, 0.95))
+trace
+```
+This causes `foo` to be invoked five times.
+The resulting trace may contain the following random choices:
+```@example unfold_combinator
+get_choices(trace)
+```
+then the return value is:
+```@example unfold_combinator
+get_retval(trace)
+```
+
+## Switch combinator
+
+```@raw html
+
+

+
+```
+
+Consider the following constructions:
+
+```@setup switch_combinator
+using Gen
+```
+
+```@example switch_combinator
+@gen function line(x)
+ z ~ normal(3*x+1,1.0)
+ return z
+end
+
+@gen function outlier(x)
+ z ~ normal(3*x+1, 10.0)
+ return z
+end
+
+switch_model = Switch(line, outlier)
+```
+
+This creates a new generative function `switch_model` whose arguments take the form `(branch, args...)`. By default,
+branch is an integer indicating which generative function to execute. For example, branch `2` corresponds to `outlier`:
+
+```@example switch_combinator
+trace = simulate(switch_model, (2, 5.0))
+get_choices(trace)
+```
diff --git a/docs/src/ref/gfi.md b/docs/src/tutorials/basics/gfi.md
similarity index 86%
rename from docs/src/ref/gfi.md
rename to docs/src/tutorials/basics/gfi.md
index c6b03fc7b..723f07865 100644
--- a/docs/src/ref/gfi.md
+++ b/docs/src/tutorials/basics/gfi.md
@@ -1,4 +1,4 @@
-# Generative Functions
+# [Generative Function Interface](@id gfi)
One of the core abstractions in Gen is the **generative function**.
Generative functions are used to represent a variety of different types of probabilistic computations including generative models, inference models, custom proposal distributions, and variational approximations.
@@ -6,13 +6,11 @@ Generative functions are used to represent a variety of different types of proba
## Introduction
Generative functions are represented by the following abstact type:
-```@docs
-GenerativeFunction
-```
There are various kinds of generative functions, which are represented by concrete subtypes of [`GenerativeFunction`](@ref).
-For example, the [Built-in Modeling Language](@ref) allows generative functions to be constructed using Julia function definition syntax:
-```julia
+For example, the [Built-in Modeling Language](@ref dynamic_modeling_language) allows generative functions to be constructed using Julia function definition syntax:
+```@example gfi_tutorial
+using Gen # hide
@gen function foo(a, b=0)
if @trace(bernoulli(0.5), :z)
return a + b + 1
@@ -21,14 +19,14 @@ For example, the [Built-in Modeling Language](@ref) allows generative functions
end
end
```
-Users can also extend Gen by implementing their own [Custom generative function types](@ref), which can be new modeling languages, or just specialized optimized implementations of a fragment of a specific model.
+Users can also extend Gen by implementing their own [custom generative functions](@ref custom_generative_functions), which can be new modeling languages, or just specialized optimized implementations of a fragment of a specific model.
Generative functions behave like Julia functions in some respects.
For example, we can call a generative function `foo` on arguments and get an output value using regular Julia call syntax:
-```julia-repl
-julia> foo(2, 4)
-7
+```@example gfi_tutorial
+foo(2, 4)
```
+
However, generative functions are distinct from Julia functions because they support additional behaviors, described in the remainder of this section.
@@ -52,7 +50,7 @@ The set of valid argument tuples to the function, denoted ``X``.
A family of probability distributions ``p(t, r; x)`` on maps ``t`` from random choice addresses to their values, and non-addressable randomness ``r``, indexed by arguments ``x``, for all ``x \in X``.
Note that the distribution must be normalized:
```math
-\sum_{t, r} p(t, r; x) = 1 \;\; \mbox{for all} \;\; x \in X
+\sum_{t, r} p(t, r; x) = 1 \;\; \text{for all} \;\; x \in X
```
This corresponds to a requirement that the function terminate with probabability 1 for all valid arguments.
We use ``p(t; x)`` to denote the marginal distribution on the map ``t``:
@@ -80,17 +78,17 @@ It is typically used by inference programs, for logging and for caching the resu
A family of probability distributions ``q(t; x, u)`` on maps ``t`` from random choice addresses to their values, indexed by tuples ``(x, u)`` where ``u`` is a map from random choice addresses to values, and where ``x`` are the arguments to the function.
It must satisfy the following conditions:
```math
-\sum_{t} q(t; x, u) = 1 \;\; \mbox{for all} \;\; x \in X, u
+\sum_{t} q(t; x, u) = 1 \;\; \text{for all} \;\; x \in X, u
```
```math
-p(t; x) > 0 \mbox{ if and only if } q(t; x, u) > 0 \mbox{ for all } u \mbox{ where } u \mbox{ and } t \mbox{ agree }
+p(t; x) > 0 \text{ if and only if } q(t; x, u) > 0 \text{ for all } u \text{ where } u \text{ and } t \text{ agree }
```
```math
-q(t; x, u) > 0 \mbox{ implies that } u \mbox{ and } t \mbox{ agree }.
+q(t; x, u) > 0 \text{ implies that } u \text{ and } t \text{ agree }.
```
There is also a family of probability distributions ``q(r; x, t)`` on non-addressable randomness, that satisfies:
```math
-q(r; x, t) > 0 \mbox{ if and only if } p(r | t, x) > 0
+q(r; x, t) > 0 \text{ if and only if } p(r | t, x) > 0
```
@@ -115,9 +113,7 @@ Traces contain:
Different concrete types of generative functions use different data structures and different Julia types for their traces, but traces are subtypes of [`Trace`](@ref).
-```@docs
-Trace
-```
+
The concrete trace type that a generative function uses is the second type parameter of the [`GenerativeFunction`](@ref) abstract type.
For example, the trace type of [`DynamicDSLFunction`](@ref) is `DynamicDSLTrace`.
@@ -149,10 +145,9 @@ z = trace[:z]
```
When a generative function has default values specified for trailing arguments, those arguments can be left out when calling [`simulate`](@ref), [`generate`](@ref), and other functions provided by the generative function interface. The default values will automatically be filled in:
-```julia
-julia> trace = simulate(foo, (2,));
-julia> get_args(trace)
-(2, 0)
+```@example gfi_tutorial
+trace = simulate(foo, (2,));
+get_args(trace)
```
## Updating traces
@@ -161,7 +156,7 @@ It is often important to incrementally modify the trace of a generative function
In Gen, traces are **functional data structures**, meaning they can be treated as immutable values.
There are several methods that take a trace of a generative function as input and return a new trace of the generative function based on adjustments to the execution history of the function.
We will illustrate these methods using the following generative function:
-```julia
+```@example gfi_tutorial
@gen function bar()
val = @trace(bernoulli(0.3), :a)
if @trace(bernoulli(0.4), :b)
@@ -173,17 +168,17 @@ We will illustrate these methods using the following generative function:
return val
end
```
+
Suppose we have a trace (`trace`) of `bar` with initial choices:
+```@example gfi_tutorial
+trace, _ = generate(bar, (), choicemap(:a=>false, :b=>true, :c=>false, :e=>true)) # hide
+nothing # hide
```
-│
-├── :a : false
-│
-├── :b : true
-│
-├── :c : false
-│
-└── :e : true
+
+```@example gfi_tutorial
+get_choices(trace) # hide
```
+
Note that address `:d` is not present because the branch in which `:d` is sampled was not taken because random choice `:b` had value `true`.
### Update
@@ -191,34 +186,27 @@ The [`update`](@ref) method takes a trace and generates an adjusted trace that i
**Example.**
Suppose we run [`update`](@ref) on the example `trace`, with the following constraints:
+
+```@example gfi_tutorial
+choicemap(:b=>false, :d=>true) # hide
```
-│
-├── :b : false
-│
-└── :d : true
-```
-```julia
+
+```@example gfi_tutorial
constraints = choicemap((:b, false), (:d, true))
(new_trace, w, _, discard) = update(trace, (), (), constraints)
```
Then `get_choices(new_trace)` will be:
+
+```@example gfi_tutorial
+get_choices(new_trace)
```
-│
-├── :a : false
-│
-├── :b : false
-│
-├── :d : true
-│
-└── :e : true
-```
+
and `discard` will be:
+
+```@example gfi_tutorial
+discard
```
-│
-├── :b : true
-│
-└── :c : false
-```
+
Note that the discard contains both the previous values of addresses that were overwritten, and the values for addresses that were in the previous trace but are no longer in the new trace.
The weight (`w`) is computed as:
```math
@@ -229,44 +217,34 @@ w = \log p(t'; x')/p(t; x) = \log 0.0294/0.0784 = \log 0.375
**Example.**
Suppose we run [`update`](@ref) on the example `trace`, with the following constraints, which *do not* contain a value for `:d`:
+
+```@example gfi_tutorial
+choicemap(:b=>false) # hide
```
-│
-└── :b : false
-```
-```julia
+
+```@example gfi_tutorial
constraints = choicemap((:b, false))
(new_trace, w, _, discard) = update(trace, (), (), constraints)
```
-Then `get_choices(new_trace)` will be:
-```
-│
-├── :a : false
-│
-├── :b : false
-│
-├── :d : true
-│
-└── :e : true
-```
-with probability 0.1, or:
+
+Since `b` is constrained to `false`, the updated trace must now sample at address `d` (note address `e` remains fixed). There are two possibilities for `get_choices(new_trace)`:
+
+The first choicemap:
+```@example gfi_tutorial
+choicemap(:a=>false, :b=>false, :d=>true, :e=>true) # hide
```
-│
-├── :a : false
-│
-├── :b : false
-│
-├── :d : false
-│
-└── :e : true
+occurs with probability 0.1. The second:
+
+```@example gfi_tutorial
+choicemap(:a=>false, :b=>false, :d=>false, :e=>true) # hide
```
-with probability 0.9.
+
+occurs with probability 0.9.
Also, `discard` will be:
+```@example gfi_tutorial
+discard # hide
```
-│
-├── :b : true
-│
-└── :c : false
-```
+
If the former case occurs and `:d` is assigned to `true`, then the weight (`w`) is computed as:
```math
p(t; x) = 0.7 × 0.4 × 0.4 × 0.7 = 0.0784\\
@@ -281,7 +259,7 @@ The [`regenerate`](@ref) method takes a trace and generates an adjusted trace th
**Example.**
Suppose we run [`regenerate`](@ref) on the example `trace`, with selection `:a` and `:b`:
-```julia
+```@example gfi_tutorial
(new_trace, w, _) = regenerate(trace, (), (), select(:a, :b))
```
Then, a new value for `:a` will be sampled from `bernoulli(0.3)`, and a new value for `:b` will be sampled from `bernoulli(0.4)`.
@@ -290,7 +268,10 @@ If the new value for `:b` is `false`, then a new value for `:d` will be sampled
The previous value for `:c` will always be retained.
Suppose the new value for `:a` is `true`, and the new value for `:b` is `true`.
Then `get_choices(new_trace)` will be:
+```@example gfi_tutorial
+get_choices(new_trace)
```
+
The weight (`w`) is ``\log 1 = 0``.
@@ -315,17 +296,16 @@ This argdiff information permits the implementation of the update method to avoi
Note that the correctness of the argdiff is in general not verified by Gen---passing incorrect argdiff information may result in incorrect behavior.
The trace update methods for all generative functions above should accept at least the following types of argdiffs:
-```@docs
-NoChange
-UnknownChange
-```
+- [`NoChange`](@ref)
+- [`UnknownChange`](@ref)
+
Generative functions may also be able to process more specialized diff data types for each of their arguments, that allow more precise information about the different to be supplied.
### Retdiffs
To enable generative functions that invoke other functions to efficiently make use of incremental computation, the trace update methods of generative functions also return a **retdiff** value, which provides information about the difference in the return value of the previous trace an the return value of the new trace.
-## Differentiable programming
+## [Differentiable Programming](@id differentiable_programming_tutorial)
The trace of a generative function may support computation of gradients of its log probability with respect to some subset of (i) its arguments, (ii) values of random choice, and (iii) any of its **trainable parameters** (see below).
@@ -349,27 +329,3 @@ Users then use these accumulated gradients to update to the values of the traina
The set of elements (either arguments, random choices, or trainable parameters) for which gradients are available is called the **gradient source set**.
If the return value of the function is conditionally dependent on any element in the gradient source set given the arguments and values of all other random choices, for all possible traces of the function, then the generative function requires a *return value gradient* to compute gradients with respect to elements of the gradient source set.
This static property of the generative function is reported by [`accepts_output_grad`](@ref).
-
-## Generative function interface
-
-The complete set of methods in the generative function interface (GFI) is:
-```@docs
-simulate
-generate
-update
-regenerate
-get_args
-get_retval
-get_choices
-get_score
-get_gen_fn
-Base.getindex
-project
-propose
-assess
-has_argument_grads
-accepts_output_grad
-accumulate_param_gradients!
-choice_gradients
-get_params
-```
diff --git a/docs/src/tutorials/basics/modeling_in_gen.md b/docs/src/tutorials/basics/modeling_in_gen.md
new file mode 100644
index 000000000..7ec032a98
--- /dev/null
+++ b/docs/src/tutorials/basics/modeling_in_gen.md
@@ -0,0 +1,1388 @@
+# [Introduction to Modeling in Gen](@id introduction_to_modeling_in_gen)
+
+Welcome! In this tutorial, you'll get your feet wet with Gen, a
+multi-paradigm platform for probabilistic modeling and inference. By
+"multi-paradigm," we mean that Gen supports many different approaches to
+modeling and inference:
+
+- Unsupervised learning and posterior inference in generative models using
+ Monte Carlo, variational, EM, and stochastic gradient techniques.
+
+- Supervised learning of conditional inference models (e.g. supervised
+ classification and regression).
+
+- Hybrid approaches including amortized inference / inference compilation,
+ variational autoencoders, and semi-supervised learning.
+
+Don't worry if you haven't seen some of these approaches before. One goal of
+these tutorials will be to introduce you to a subset of them,
+from a unified probabilistic programming perspective.
+
+#### In this Tutorial
+
+Approaching a problem from a probabilistic perspective requires both *modeling*
+and *inference*:
+
+- **Modeling**: You first need to frame the problem — and any assumptions you
+ bring to the table — as a probabilistic model. A huge variety of problems can
+ be viewed from a modeling & inference lens, if you set them up properly.
+ **This tutorial is about how to think of problems in this light, and how to use Gen**
+ **to formally specify your assumptions and the tasks you wish to solve.**
+
+- **Inference**: You then need to do the hard part: inference, that is,
+ solving the problem. In this notebook, we'll use a particularly simple
+ *generic* inference algorithm: importance sampling with the prior as our
+ proposal distributions. With enough computation, the algorithm
+ can in theory solve any modeling and inference problem, but in practice, for most problems of
+ interest, it is too slow to achieve accurate results in a reasonable amount of time.
+ **Future tutorials introduce some of Gen's**
+ **programmable inference features**,
+ which let you tailor the inference algorithm for use with more complex models (Gen will still automate the math!).
+
+Throughout this
+tutorial,
+we will emphasize key degrees of modeling flexibility
+afforded by the probabilistic programming approach, for example:
+
+- Using a stochastic branching and function abstraction to express
+ uncertainty about which of multiple models is appropriate.
+
+- Representing models with an unbounded number of parameters (a 'Bayesian
+ non-parametric' model) using loops and recursion.
+
+We'll also introduce a technique for validating a model and
+inference algorithm by predicting new data from inferred parameters, and
+comparing this data to the observed data set.
+
+However, this tutorial does not exhaustively cover all features of Gen's modeling language.
+For example, Gen's modeling combinators and its static modeling language
+enable improved performance, but are not covered here.
+
+## Gen and Julia
+
+Gen is a package for the Julia language. The package can be loaded with:
+
+
+```@example tutorial_1
+using Gen
+```
+
+Gen programs typically consist of a combination of (i) probabilistic models written in modeling languages and (ii) inference programs written in regular Julia code. Gen provides a built-in modeling language that is itself based on Julia. This tutorial uses the [Plots.jl](https://github.com/JuliaPlots/Plots.jl) Julia package for plotting.
+
+```@example tutorial_1
+using Plots
+```
+
+## [Generative Functions](@id gfi_tutorial_section)
+
+Probabilistic models are represented in Gen as *generative functions*.
+Generative functions are used to represent a variety of different types of
+probabilistic computations including generative models, inference models,
+custom proposal distributions, and variational approximations (see the [Gen
+documentation](https://www.gen.dev/docs/stable/ref/gfi/) or the
+[paper](https://dl.acm.org/doi/10.1145/3314221.3314642)). In this
+tutorial,
+we focus on implementing _generative models_. A generative model represents
+a data-generating process; as such, it encodes any assumptions we have about
+our data and our problem domain.
+
+
+The simplest way to construct a generative function is by using the [built-in
+modeling DSL](https://www.gen.dev/docs/stable/ref/modeling/). Generative
+functions written in the built-in modeling DSL are based on Julia function
+definition syntax, but are prefixed with the `@gen` macro:
+
+```julia
+@gen function function_name_here(input_arguments)
+ # Function body...
+end
+```
+
+The function represents the data-generating process we are modeling.
+Conceptually, every time we run the function, it should generate a new
+"synthetic dataset" in line with our assumptions. Along the way, it will make
+random choices; each random choice it makes can be thought of as adding
+a random variable to a probabilistic model.
+
+Within the function body, most Julia code is permitted, but random choices use
+special syntax that annotates them with an _address_:
+
+```julia
+{addr} ~ distribution(parameters)
+```
+
+A simple example of such an invocation is a normal distribution parametrized
+with mean 0 and standard deviation 1:
+
+```julia
+my_variable = {:my_variable_address} ~ normal(0, 1)
+```
+
+Every random choice must be given an _address_, which can be
+an arbitrary value—but we often use a symbol.
+(`:my_variable_address` is a symbol in the Julia language.)
+Think of the address as the name of a particular random choice, which
+is distinct from the name of the variable. For example, consider
+the following code:
+
+```julia
+x = {:initial_x} ~ normal(0, 1)
+if x < 0
+ x = x + ({:addition_to_x} ~ normal(2, 1))
+end
+```
+
+This code manipulates a single variable, `x`, but may make up to two random
+choices: `:initial_x` and `:addition_to_x`.
+
+
+Note that we can only use `~` to give addresses to _random choices_.
+The following will _not_ work because the code is trying to trace the
+expression `sin(x)` which is an invocation of an ordinary Julia function, not
+a distribution.
+```Julia
+# INVALID:
+my_variable = {:not_a_random_choice} ~ sin(x)
+```
+(We will see a bit later that it is _also_ possible to use `~`
+to sample from helper _generative functions_, not just primitive
+distributions like `normal`. But for now, think of `~` as being
+for making random choices.)
+
+### Example 1: Linear regression
+
+Suppose we have a dataset of points $(x, y)$ in the plane, and we'd like
+to infer a likely slope and intercept that explains their (linear) relationship.
+To approach this problem from a probabilistic perspective, we first need to
+develop a model. The model answers the question: how might this dataset have
+come to be? It also encodes our assumptions, e.g., our assumption that our
+data is explained by a linear relationship between $x$ and $y$.
+
+The generative function below represents a probabilistic model of a linear
+relationship in the x-y plane. Given a set of $x$ coordinates, it randomly
+chooses a line in the plane and generates corresponding $y$ coordinates so
+that each $(x, y)$ is near the line. We might think of this function as
+modeling house prices as a function of square footage, or the measured volume
+of a gas as a function of its measured temperature.
+
+
+```@example tutorial_1
+@gen function line_model(xs::Vector{Float64})
+ # We begin by sampling a slope and intercept for the line.
+ # Before we have seen the data, we don't know the values of
+ # these parameters, so we treat them as random choices. The
+ # distributions they are drawn from represent our prior beliefs
+ # about the parameters: in this case, that neither the slope nor the
+ # intercept will be more than a couple points away from 0.
+ slope = ({:slope} ~ normal(0, 1))
+ intercept = ({:intercept} ~ normal(0, 2))
+
+ # We define a function to compute y for a given x
+ function y(x)
+ return slope * x + intercept
+ end
+
+ # Given the slope and intercept, we can sample y coordinates
+ # for each of the x coordinates in our input vector.
+ for (i, x) in enumerate(xs)
+ # Note that we name each random choice in this loop
+ # slightly differently: the first time through,
+ # the name (:y, 1) will be used, then (:y, 2) for
+ # the second point, and so on.
+ ({(:y, i)} ~ normal(y(x), 0.1))
+ end
+
+ # Most of the time, we don't care about the return
+ # value of a model, only the random choices it makes.
+ # It can sometimes be useful to return something
+ # meaningful, however; here, we return the function `y`.
+ return y
+end;
+```
+
+The generative function takes as an argument a vector of x-coordinates. We create one below:
+
+
+```@example tutorial_1
+xs = [-5., -4., -3., -2., -1., 0., 1., 2., 3., 4., 5.];
+```
+
+Given this vector, the generative function samples a random choice
+representing the slope of a line from a normal distribution with mean 0 and
+standard deviation 1, and a random choice representing the intercept of a
+line from a normal distribution with mean 0 and standard deviation 2. In
+Bayesian statistics terms, these distributions are the *prior distributions*
+of the slope and intercept respectively. Then, the function samples values
+for the y-coordinates corresponding to each of the provided x-coordinates.
+
+This generative function returns a function `y` encoding the slope
+and intercept.
+We can run the model like we run a regular Julia function:
+
+
+```@example tutorial_1
+y = line_model(xs)
+```
+
+
+
+
+ y (generic function with 1 method)
+
+
+
+This gives us the return value of the model, but we
+may be more interested in _the values of the random choices_ that
+`line_model` makes. **Crucially, each random choice is annotated with a
+unique *address*.** A random choice is assigned an address using the `{addr} ~ ...`
+syntax. Addresses can be any Julia value. In this program, there are two
+types of addresses used -- Julia symbols and tuples of symbols and integers.
+Note that within the `for` loop, the same line of code is executed multiple
+times, but each time, the random choice it makes is given a distinct address.
+
+Although the random choices are not included in the return value, they *are*
+included in the *execution trace* of the generative function. We can run the
+generative function and obtain its trace using the
+[`simulate`](https://www.gen.dev/docs/stable/ref/gfi/#Gen.simulate) method
+from the Gen API:
+
+
+```@example tutorial_1
+trace = Gen.simulate(line_model, (xs,));
+```
+
+This method takes the function to be executed and a tuple of arguments to the function, and returns a trace. When we print the trace, we see that it is a complex data structure.
+
+
+```@example tutorial_1
+println(trace)
+```
+
+
+A trace of a generative function contains various information about an execution of the function. For example, it contains the arguments on which the function was run, which are available with the API method `get_args`:
+
+
+```@example tutorial_1
+Gen.get_args(trace)
+```
+
+
+The trace also contains the value of the random choices, stored in a map from address to value called a *choice map*. This map is available through the API method [`get_choices`](https://www.gen.dev/docs/stable/ref/gfi/#Gen.get_choices):
+
+
+```@example tutorial_1
+Gen.get_choices(trace)
+```
+
+
+
+We can pull out individual values from this map using Julia's subscripting syntax `[...]`:
+
+
+```@example tutorial_1
+choices = Gen.get_choices(trace)
+choices[:slope]
+```
+
+We can also read the value of a random choice directly from the trace, without having to use `get_choices` first:
+
+
+```@example tutorial_1
+trace[:slope]
+```
+
+The return value is also recorded in the trace, and is accessible with the `get_retval` API method:
+
+
+```@example tutorial_1
+Gen.get_retval(trace)
+```
+
+Or we can access the return value directly from the trace via the syntactic sugar `trace[]`:
+
+
+```@example tutorial_1
+trace[]
+```
+
+
+In order to understand the probabilistic behavior of a generative function, it is helpful to be able to visualize its traces. Below, we define a function that uses Plots.jl to render a trace of the generative function above. The rendering shows the x-y data points and the line that is represented by the slope and intercept choices.
+
+
+```@example tutorial_1
+function render_trace(trace; show_data=true)
+
+ # Pull out xs from the trace
+ xs, = get_args(trace)
+
+ xmin = minimum(xs)
+ xmax = maximum(xs)
+
+ # Pull out the return value, useful for plotting
+ y = get_retval(trace)
+
+ # Draw the line
+ test_xs = collect(range(-5, stop=5, length=1000))
+ fig = plot(test_xs, map(y, test_xs), color="black", alpha=0.5, label=nothing,
+ xlim=(xmin, xmax), ylim=(xmin, xmax))
+
+ if show_data
+ ys = [trace[(:y, i)] for i=1:length(xs)]
+
+ # Plot the data set
+ scatter!(xs, ys, c="black", label=nothing)
+ end
+
+ return fig
+end;
+```
+
+
+```@example tutorial_1
+render_trace(trace)
+```
+
+
+Because a generative function is stochastic, we need to visualize many runs in order to understand its behavior. The cell below renders a grid of traces.
+
+
+```@example tutorial_1
+function grid(renderer::Function, traces)
+ Plots.plot(map(renderer, traces)...)
+end;
+```
+
+
+```@example tutorial_1
+traces = [Gen.simulate(line_model, (xs,)) for _=1:12]
+grid(render_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+-------------------------------------------------
+### Exercise
+
+Write a generative function that uses the same address twice. Run it to see what happens.
+
+-------------------------------
+### Exercise
+
+Write a model that generates a sine wave with random phase, period and
+amplitude, and then generates y-coordinates from a given vector of
+x-coordinates by adding noise to the value of the wave at each x-coordinate.
+Use a `gamma(5, 1)` prior distribution for the period, and a `gamma(1, 1)`
+prior distribution on the amplitude (see
+[`Gen.gamma`](https://www.gen.dev/docs/stable/ref/distributions/#Gen.gamma)).
+Sampling from a Gamma distribution will ensure that we get positive real values.
+Use a uniform distribution between 0 and $2\pi$ for the phase (see
+[`Gen.uniform`](https://www.gen.dev/docs/stable/ref/distributions/#Gen.uniform)).
+
+The sine wave should implement:
+
+$y(x) = a \sin(2\pi \frac{1}{p} x + \varphi)$,
+
+where $a$ is the amplitude, $p$ is the period and $\varphi$ is the phase. In
+Julia the constant $\pi$ can be expressed as either `pi` or `π` (unicode).
+
+When calling `trace = Gen.simulate(sine_model, (xs,))`, the following choices should appear:
+- amplitude: `trace[:amplitude]`
+- period: `trace[:period]`
+- phase: `trace[:phase]`
+
+We have provided some starter code for the sine wave model:
+
+
+```@example tutorial_1
+@gen function sine_model(xs::Vector{Float64})
+
+ # < your code here, for sampling a phase, period, and amplitude >
+
+ function y(x)
+ return 1 # < Edit this line to compute y for a given x >
+ end
+
+ for (i, x) in enumerate(xs)
+ {(:y, i)} ~ normal(y(x), 0.1)
+ end
+
+ return y # We return the y function so it can be used for plotting, below.
+end;
+```
+
+
+```@example tutorial_1
+traces = [Gen.simulate(sine_model, (xs,)) for _=1:12];
+```
+
+
+```@example tutorial_1
+grid(render_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+
+
+
+## Posterior Inference
+
+Of course, we don't really care about generating lots of pictures of lines
+(or sine waves). We'd really like to begin with an actual dataset of observed
+$(x, y)$ points, and infer the corresponding slope and intercept (or phase,
+period, and amplitude). This task is called _posterior inference_.
+
+We now will provide a data set of y-coordinates and try to draw inferences
+about the process that generated the data. We begin with the following data
+set:
+
+
+```@example tutorial_1
+ys = [6.75003, 6.1568, 4.26414, 1.84894, 3.09686, 1.94026, 1.36411, -0.83959, -0.976, -1.93363, -2.91303];
+```
+
+
+```@example tutorial_1
+scatter(xs, ys, color="black", label=nothing, title="Observed data (linear)", xlabel="X", ylabel="Y")
+```
+
+
+
+
+
+
+
+
+
+We will assume that the line model was responsible for generating the data,
+and infer values of the slope and intercept that explain the data.
+
+To do this, we write a simple *inference program* that takes the model we are
+assuming generated our data, the data set, and the amount of computation to
+perform, and returns a trace of the function that is approximately sampled
+from the _posterior distribution_ on traces of the function, given the
+observed data. That is, the inference program will try to find a trace that
+well explains the dataset we created above. We can inspect that trace to find
+estimates of the slope and intercept of a line that fits the data.
+
+Functions like `importance_resampling` expect us to provide a _model_ and
+also a _choice map_ representing our data set and relating it to the model.
+A choice map maps random choice addresses from the model to values from our
+data set. Here, we want to tie model addresses like `(:y, 4)` to data set
+values like `ys[4]`:
+
+
+```@example tutorial_1
+function do_inference(model, xs, ys, amount_of_computation)
+
+ # Create a choice map that maps model addresses (:y, i)
+ # to observed values ys[i]. We leave :slope and :intercept
+ # unconstrained, because we want them to be inferred.
+ observations = Gen.choicemap()
+ for (i, y) in enumerate(ys)
+ observations[(:y, i)] = y
+ end
+
+ # Call importance_resampling to obtain a likely trace consistent
+ # with our observations.
+ (trace, _) = Gen.importance_resampling(model, (xs,), observations, amount_of_computation);
+ return trace
+end;
+```
+
+We can run the inference program to obtain a trace, and then visualize the result:
+
+
+```@example tutorial_1
+trace = do_inference(line_model, xs, ys, 100)
+render_trace(trace)
+```
+
+
+
+
+
+
+
+
+
+We see that `importance_resampling` found a reasonable slope and intercept to explain the data. We can also visualize many samples in a grid:
+
+
+```@example tutorial_1
+traces = [do_inference(line_model, xs, ys, 100) for _=1:10];
+grid(render_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+We can see here that there is some uncertainty: with our limited data, we can't be 100% sure exactly where the line is. We can get a better sense for the variability in the posterior distribution by visualizing all the traces in one plot, rather than in a grid. Each trace is going to have the same observed data points, so we only plot those once, based on the values in the first trace:
+
+
+```@example tutorial_1
+function overlay(renderer, traces; same_data=true, args...)
+ fig = renderer(traces[1], show_data=true, args...)
+
+ xs, = get_args(traces[1])
+ xmin = minimum(xs)
+ xmax = maximum(xs)
+
+ for i=2:length(traces)
+ y = get_retval(traces[i])
+ test_xs = collect(range(-5, stop=5, length=1000))
+ fig = plot!(test_xs, map(y, test_xs), color="black", alpha=0.5, label=nothing,
+ xlim=(xmin, xmax), ylim=(xmin, xmax))
+ end
+ return fig
+end;
+```
+
+
+```@example tutorial_1
+traces = [do_inference(line_model, xs, ys, 100) for _=1:10];
+overlay(render_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+--------------
+
+### Exercise
+
+The results above were obtained for `amount_of_computation = 100`. Run the algorithm with this value set to `1`, `10`, and `1000`, etc. Which value seems like a good tradeoff between accuracy and running time? Discuss.
+
+------------------
+
+### Exercise
+Consider the following data set.
+
+
+```@example tutorial_1
+ys_sine = [2.89, 2.22, -0.612, -0.522, -2.65, -0.133, 2.70, 2.77, 0.425, -2.11, -2.76];
+```
+
+
+```@example tutorial_1
+scatter(xs, ys_sine, color="black", label=nothing)
+```
+
+
+
+
+
+
+
+
+
+Write an inference program that generates traces of `sine_model` that explain this data set. Visualize the resulting distribution of traces. Temporarily change the prior distribution on the period to be `gamma(1, 1)` (by changing and re-running the cell that defines `sine_model` from a previous exercise). Can you explain the difference in inference results when using `gamma(1, 1)` vs `gamma(5, 1)` prior on the period? How much computation did you need to get good results?
+
+## Predicting new data
+
+What if we'd want to predict `ys` given `xs`?
+
+Using the API method
+[`generate`](https://www.gen.dev/docs/stable/ref/gfi/#Gen.generate), we
+can generate a trace of a generative function in which the values of certain
+random choices are constrained to given values. The constraints are a choice
+map that maps the addresses of the constrained random choices to their
+desired values.
+
+For example:
+
+
+
+```@example tutorial_1
+constraints = Gen.choicemap()
+constraints[:slope] = 0.
+constraints[:intercept] = 0.
+(trace, _) = Gen.generate(line_model, (xs,), constraints)
+render_trace(trace)
+```
+
+
+
+
+
+
+
+
+
+Note that the random choices corresponding to the y-coordinates are still made randomly. Run the cell above a few times to verify this.
+
+
+We will use the ability to run constrained executions of a generative
+function to predict the value of the y-coordinates at new x-coordinates by
+running new executions of the model generative function in which the random
+choices corresponding to the parameters have been constrained to their
+inferred values. We have provided a function below (`predict_new_data`) that
+takes a trace, and a vector of new x-coordinates, and returns a vector of
+predicted y-coordinates corresponding to the x-coordinates in `new_xs`. We
+have designed this function to work with multiple models, so the set of
+parameter addresses is an argument (`param_addrs`):
+
+
+
+```@example tutorial_1
+function predict_new_data(model, trace, new_xs::Vector{Float64}, param_addrs)
+
+ # Copy parameter values from the inferred trace (`trace`)
+ # into a fresh set of constraints.
+ constraints = Gen.choicemap()
+ for addr in param_addrs
+ constraints[addr] = trace[addr]
+ end
+
+ # Run the model with new x coordinates, and with parameters
+ # fixed to be the inferred values.
+ (new_trace, _) = Gen.generate(model, (new_xs,), constraints)
+
+ # Pull out the y-values and return them.
+ ys = [new_trace[(:y, i)] for i=1:length(new_xs)]
+ return ys
+end;
+```
+
+To illustrate, we call the function above given the previous trace (which
+constrained slope and intercept to be zero).
+
+
+```@example tutorial_1
+predict_new_data(line_model, trace, [1., 2., 3.], [:slope, :intercept])
+```
+
+
+
+
+ 3-element Vector{Float64}:
+ 0.02694273341948937
+ -0.05578345643577398
+ 0.0031286541463625053
+
+
+
+The cell below defines a function that first performs inference on an
+observed data set `(xs, ys)`, and then runs `predict_new_data` to generate
+predicted y-coordinates. It repeats this process `num_traces` times, and
+returns a vector of the resulting y-coordinate vectors.
+
+
+```@example tutorial_1
+function infer_and_predict(model, xs, ys, new_xs, param_addrs, num_traces, amount_of_computation)
+ pred_ys = []
+ for i=1:num_traces
+ trace = do_inference(model, xs, ys, amount_of_computation)
+ push!(pred_ys, predict_new_data(model, trace, new_xs, param_addrs))
+ end
+ pred_ys
+end;
+```
+
+To illustrate, we generate predictions at `[1., 2., 3.]` given one (approximate) posterior
+trace.
+
+
+```@example tutorial_1
+pred_ys = infer_and_predict(line_model, xs, ys, [1., 2., 3.], [:slope, :intercept], 1, 1000)
+```
+
+
+
+
+ 1-element Vector{Any}:
+ [1.0032271405440183, -0.13506083352207435, -0.9583356345851033]
+
+
+
+Finally, we define a cell that plots the observed data set `(xs, ys)` as red dots, and the predicted data as small black dots.
+
+
+```@example tutorial_1
+function plot_predictions(xs, ys, new_xs, pred_ys; title="predictions")
+ fig = scatter(xs, ys, color="red", label="observed data", title=title)
+ for (i, pred_ys_single) in enumerate(pred_ys)
+ scatter!(new_xs, pred_ys_single, color="black", alpha=0.1, label=i == 1 ? "predictions" : nothing)
+ end
+ return fig
+end;
+```
+
+Recall the original dataset for the line model. The x-coordinates span the interval -5 to 5.
+
+
+```@example tutorial_1
+scatter(xs, ys, color="red", label="observed data")
+```
+
+
+
+
+
+
+
+
+
+We will use the inferred values of the parameters to predict y-coordinates for x-coordinates in the interval 5 to 10 from which data was not observed. We will also predict new data within the interval -5 to 5, and we will compare this data to the original observed data. Predicting new data from inferred parameters, and comparing this new data to the observed data is the core idea behind *posterior predictive checking*. This tutorial does not intend to give a rigorous overview behind techniques for checking the quality of a model, but intends to give high-level intuition.
+
+
+```@example tutorial_1
+new_xs = collect(range(-5, stop=10, length=100));
+```
+
+We generate and plot the predicted data:
+
+
+```@example tutorial_1
+pred_ys = infer_and_predict(line_model, xs, ys, new_xs, [:slope, :intercept], 20, 1000)
+plot_predictions(xs, ys, new_xs, pred_ys)
+```
+
+
+
+
+
+
+
+
+
+The results look reasonable, both within the interval of observed data and in the extrapolated predictions on the right.
+
+Now consider the same experiment run with the following data set, which has significantly more noise.
+
+
+```@example tutorial_1
+ys_noisy = [5.092, 4.781, 2.46815, 1.23047, 0.903318, 1.11819, 2.10808, 1.09198, 0.0203789, -2.05068, 2.66031];
+```
+
+
+```@example tutorial_1
+pred_ys = infer_and_predict(line_model, xs, ys_noisy, new_xs, [:slope, :intercept], 20, 1000)
+plot_predictions(xs, ys_noisy, new_xs, pred_ys)
+```
+
+
+
+
+
+
+
+
+
+It looks like the generated data is less noisy than the observed data in the regime where data was observed, and it looks like the forecasted data is too overconfident. This is a sign that our model is mis-specified. In our case, this is because we have assumed that the noise has value 0.1. However, the actual noise in the data appears to be much larger. We can correct this by making the noise a random choice as well and inferring its value along with the other parameters.
+
+We first write a new version of the line model that samples a random choice for the noise from a `gamma(1, 1)` prior distribution.
+
+
+```@example tutorial_1
+@gen function line_model_fancy(xs::Vector{Float64})
+ slope = ({:slope} ~ normal(0, 1))
+ intercept = ({:intercept} ~ normal(0, 2))
+
+ function y(x)
+ return slope * x + intercept
+ end
+
+ noise = ({:noise} ~ gamma(1, 1))
+ for (i, x) in enumerate(xs)
+ {(:y, i)} ~ normal(slope * x + intercept, noise)
+ end
+ return y
+end;
+```
+
+Then, we compare the predictions using inference of the unmodified and modified models on the `ys` data set:
+
+
+```@example tutorial_1
+pred_ys = infer_and_predict(line_model, xs, ys, new_xs, [:slope, :intercept], 20, 1000)
+fixed_noise_plot = plot_predictions(xs, ys, new_xs, pred_ys; title="fixed noise")
+
+pred_ys = infer_and_predict(line_model_fancy, xs, ys, new_xs, [:slope, :intercept, :noise], 20, 10000)
+inferred_noise_plot = plot_predictions(xs, ys, new_xs, pred_ys; title="inferred noise")
+
+plot(fixed_noise_plot, inferred_noise_plot)
+```
+
+
+
+
+
+
+
+
+
+Notice that there is more uncertainty in the predictions made using the modified model.
+
+We also compare the predictions using inference of the unmodified and modified models on the `ys_noisy` data set:
+
+
+```@example tutorial_1
+pred_ys = infer_and_predict(line_model, xs, ys_noisy, new_xs, [:slope, :intercept], 20, 1000)
+fixed_noise_plot = plot_predictions(xs, ys_noisy, new_xs, pred_ys; title="fixed noise")
+
+
+pred_ys = infer_and_predict(line_model_fancy, xs, ys_noisy, new_xs, [:slope, :intercept, :noise], 20, 10000)
+inferred_noise_plot = plot_predictions(xs, ys_noisy, new_xs, pred_ys; title="inferred noise")
+
+plot(fixed_noise_plot, inferred_noise_plot)
+```
+
+
+
+
+
+
+
+
+
+Notice that while the unmodified model was very overconfident, the modified model has an appropriate level of uncertainty, while still capturing the general negative trend.
+
+-------------------------
+### Exercise
+
+Write a modified version of the sine model that makes noise into a random choice. Compare the predicted data with the observed data using `infer_and_predict` and `plot_predictions` for the unmodified and modified models, and for the `ys_sine` and `ys_noisy` data sets. Discuss the results. Experiment with the amount of inference computation used. The amount of inference computation will need to be higher for the model with the noise as a random choice.
+
+We have provided you with starter code:
+
+
+```@example tutorial_1
+@gen function sine_model_fancy(xs::Vector{Float64})
+
+ # < your code here >
+
+ for (i, x) in enumerate(xs)
+ {(:y, i)} ~ normal(0., 0.1) # < edit this line >
+ end
+ return nothing
+end;
+```
+
+
+```@example tutorial_1
+# Modify the line below to experiment with the amount_of_computation parameter
+pred_ys = infer_and_predict(sine_model, xs, ys_sine, new_xs, [], 20, 1)
+fixed_noise_plot = plot_predictions(xs, ys_sine, new_xs, pred_ys; title="Fixed noise level")
+
+# Modify the line below to experiment with the amount_of_computation parameter
+pred_ys = infer_and_predict(sine_model_fancy, xs, ys_sine, new_xs, [], 20, 1)
+inferred_noise_plot = plot_predictions(xs, ys_sine, new_xs, pred_ys; title="Inferred noise level")
+
+Plots.plot(fixed_noise_plot, inferred_noise_plot)
+```
+
+
+```@example tutorial_1
+# Modify the line below to experiment with the amount_of_computation parameter
+pred_ys = infer_and_predict(sine_model, xs, ys_noisy, new_xs, [], 20, 1)
+fixed_noise_plot = plot_predictions(xs, ys_noisy, new_xs, pred_ys; title="Fixed noise level")
+
+# Modify the line below to experiment with the amount_of_computation parameter
+pred_ys = infer_and_predict(sine_model_fancy, xs, ys_noisy, new_xs, [], 20, 1)
+inferred_noise_plot = plot_predictions(xs, ys_noisy, new_xs, pred_ys; title="Inferred noise level")
+
+Plots.plot(fixed_noise_plot, inferred_noise_plot)
+```
+
+
+
+
+## Calling other generative functions
+
+In addition to making random choices, generative functions can invoke other
+generative functions. To illustrate this, we will write a probabilistic model
+that combines the line model and the sine model. This model is able to
+explain data using either model, and which model is chosen will depend on the
+data. This is called *model selection*.
+
+A generative function can invoke another generative function in three ways:
+
+- **(NOT RECOMMENDED)** using regular Julia function call syntax: `f(x)`
+
+- using the `~` syntax with an address for the call: `{addr} ~ f(x)`
+
+- using the `~` syntax with a wildcard address: `{*} ~ f(x)`
+
+When invoking using regular function call syntax, the random choices made by
+the callee function are not traced at all, and Gen cannot reason about them during inference.
+When invoking using `~` with a _wildcard_ address (`{*} ~ f(x)`), the random choices of the
+callee function are imported directly into the caller's trace. So, for example, if `f` makes a choice
+called `:f_choice`, then the caller's trace will have a choice called `:f_choice` too.
+Note that a downside of this is that if `f` is called _twice_ by the same caller, then the two
+choices called `:f_choice` will clash, leading to an error.
+In this case, it is best to provide an address (`{addr} ~ f(x)`): `f`'s random choices will
+be placed under the _namespace_ `addr`.
+
+
+```@example tutorial_1
+@gen function foo()
+ {:y} ~ normal(0, 1)
+end
+
+@gen function bar()
+ {:x} ~ bernoulli(0.5)
+ # Call `foo` with a wildcard address.
+ # Its choices (:y) will appear directly
+ # within the trace of `bar`.
+ {*} ~ foo()
+end
+
+@gen function bar_using_namespace()
+ {:x} ~ bernoulli(0.5)
+ # Call `foo` with the address `:z`.
+ # The internal choice `:y` of `foo`
+ # will appear in our trace at the
+ # hierarchical address `:z => :y`.
+ {:z} ~ foo()
+end;
+```
+
+We first show the addresses sampled by `bar`:
+
+
+```@example tutorial_1
+trace = Gen.simulate(bar, ())
+Gen.get_choices(trace)
+```
+
+
+
+
+ │
+ ├── :y : 0.6475752978557953
+ │
+ └── :x : true
+
+
+
+
+And the addresses sampled by `bar_using_namespace`:
+
+
+```@example tutorial_1
+trace = Gen.simulate(bar_using_namespace, ())
+Gen.get_choices(trace)
+```
+
+
+
+
+ │
+ ├── :x : true
+ │
+ └── :z
+ │
+ └── :y : 0.3801328264226968
+
+
+
+
+Using `{addr} ~ f()`, with a namespace, can help avoid address collisions for complex models.
+
+A hierarchical address is represented as a Julia `Pair`, where the first element of the pair is the first element of the address and the second element of the pair is the rest of the address:
+
+
+```@example tutorial_1
+trace[Pair(:z, :y)]
+```
+
+
+
+
+ 0.3801328264226968
+
+
+
+Julia uses the `=>` operator as a shorthand for the `Pair` constructor, so we can access choices at hierarchical addresses like:
+
+
+```@example tutorial_1
+trace[:z => :y]
+```
+
+
+
+
+ 0.3801328264226968
+
+
+
+If we have a hierarchical address with more than two elements, we can construct the address by chaining the `=>` operator:
+
+
+```@example tutorial_1
+@gen function baz()
+ {:a} ~ bar_using_namespace()
+end
+
+trace = simulate(baz, ())
+
+trace[:a => :z => :y]
+```
+
+
+
+
+ 0.5622991768999647
+
+
+
+Note that the `=>` operator associates to the right, so this is equivalent to:
+
+
+```@example tutorial_1
+trace[Pair(:a, Pair(:z, :y))]
+```
+
+
+
+
+ 0.5622991768999647
+
+
+
+Now, we write a generative function that combines the line and sine models. It makes a Bernoulli random choice (e.g. a coin flip that returns true or false) that determines which of the two models will generate the data.
+
+
+```@example tutorial_1
+@gen function combined_model(xs::Vector{Float64})
+ if ({:is_line} ~ bernoulli(0.5))
+ # Call line_model_fancy on xs, and import
+ # its random choices directly into our trace.
+ return ({*} ~ line_model_fancy(xs))
+ else
+ # Call sine_model_fancy on xs, and import
+ # its random choices directly into our trace
+ return ({*} ~ sine_model_fancy(xs))
+ end
+end;
+```
+
+We visualize some traces, and see that sometimes it samples linear data and other times sinusoidal data.
+
+
+```@example tutorial_1
+traces = [Gen.simulate(combined_model, (xs,)) for _=1:12];
+grid(render_trace, traces)
+```
+
+
+We run inference using this combined model on the `ys` data set and the `ys_sine` data set.
+
+
+```@example tutorial_1
+traces = [do_inference(combined_model, xs, ys, 10000) for _=1:10];
+linear_dataset_plot = overlay(render_trace, traces)
+traces = [do_inference(combined_model, xs, ys_sine, 10000) for _=1:10];
+sine_dataset_plot = overlay(render_trace, traces)
+Plots.plot(linear_dataset_plot, sine_dataset_plot)
+```
+
+
+The results should show that the line model was inferred for the `ys` data set, and the sine wave model was inferred for the `ys_sine` data set.
+
+-------
+
+### Exercise
+
+Construct a data set for which it is ambiguous whether the line or sine wave model is best. Visualize the inferred traces using `render_combined` to illustrate the ambiguity. Write a program that takes the data set and returns an estimate of the posterior probability that the data was generated by the sine wave model, and run it on your data set.
+
+Hint: To estimate the posterior probability that the data was generated by the sine wave model, run the inference program many times to compute a large number of traces, and then compute the fraction of those traces in which `:is_line` is false.
+
+
+
+## Modeling with an unbounded number of parameters
+
+Gen's built-in modeling language can be used to express models that use an
+unbounded number of parameters. This section walks you through development of
+a model of data that does not a-priori specify an upper bound on the
+complexity of the model, but instead infers the complexity of the model as
+well as the parameters. This is a simple example of a *Bayesian
+nonparametric* model.
+
+We will consider two data sets:
+
+
+```@example tutorial_1
+xs_dense = collect(range(-5, stop=5, length=50))
+ys_simple = fill(1., length(xs_dense)) .+ randn(length(xs_dense)) * 0.1
+ys_complex = [Int(floor(abs(x/3))) % 2 == 0 ? 2 : 0 for x in xs_dense] .+ randn(length(xs_dense)) * 0.1;
+```
+
+
+```@example tutorial_1
+simple_plot = scatter(xs_dense, ys_simple, color="black", label=nothing, title="ys-simple", ylim=(-1, 3))
+complex_plot = scatter(xs_dense, ys_complex, color="black", label=nothing, title="ys-complex", ylim=(-1, 3))
+Plots.plot(simple_plot, complex_plot)
+```
+
+
+
+
+
+
+
+
+
+The data set on the left appears to be best explained as a constant function
+with some noise. The data set on the right appears to include two
+changepoints, with a constant function in between the changepoints. We want a
+model that does not a-priori choose the number of changepoints in the data.
+To do this, we will recursively partition the interval into regions. We
+define a Julia data structure that represents a binary tree of intervals;
+each leaf node represents a region in which the function is constant.
+
+
+```@example tutorial_1
+struct Interval
+ l::Float64
+ u::Float64
+end
+```
+
+
+```@example tutorial_1
+abstract type Node end
+
+struct InternalNode <: Node
+ left::Node
+ right::Node
+ interval::Interval
+end
+
+struct LeafNode <: Node
+ value::Float64
+ interval::Interval
+end
+```
+
+We now write a generative function that randomly creates such a tree. Note the use of recursion in this function to create arbitrarily large trees representing arbitrarily many changepoints. Also note that we assign the address namespaces `:left` and `:right` to the calls made for the two recursive calls to `generate_segments`.
+
+
+```@example tutorial_1
+@gen function generate_segments(l::Float64, u::Float64)
+ interval = Interval(l, u)
+ if ({:isleaf} ~ bernoulli(0.7))
+ value = ({:value} ~ normal(0, 1))
+ return LeafNode(value, interval)
+ else
+ frac = ({:frac} ~ beta(2, 2))
+ mid = l + (u - l) * frac
+ # Call generate_segments recursively!
+ # Because we will call it twice -- one for the left
+ # child and one for the right child -- we use
+ # addresses to distinguish the calls.
+ left = ({:left} ~ generate_segments(l, mid))
+ right = ({:right} ~ generate_segments(mid, u))
+ return InternalNode(left, right, interval)
+ end
+end;
+```
+
+We also define some helper functions to visualize traces of the `generate_segments` function.
+
+
+```@example tutorial_1
+function render_node!(node::LeafNode)
+ plot!([node.interval.l, node.interval.u], [node.value, node.value], label=nothing, linewidth=5)
+end
+
+function render_node!(node::InternalNode)
+ render_node!(node.left)
+ render_node!(node.right)
+end;
+```
+
+
+```@example tutorial_1
+function render_segments_trace(trace; xlim=(0,1))
+ node = get_retval(trace)
+ fig = plot(xlim=xlim, ylim=(-3, 3))
+ render_node!(node)
+ return fig
+end;
+```
+
+We generate 12 traces from this function and visualize them below. We plot the piecewise constant function that was sampled by each run of the generative function. Different constant segments are shown in different colors. Run the cell a few times to get a better sense of the distribution on functions that is represented by the generative function.
+
+
+```@example tutorial_1
+traces = [Gen.simulate(generate_segments, (0., 1.)) for i=1:12]
+grid(render_segments_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+Because we only sub-divide an interval with 30% probability, most of these sampled traces have only one segment.
+
+Now that we have a generative function that generates a random piecewise-constant function, we write a model that adds noise to the resulting constant functions to generate a data set of y-coordinates. The noise level will be a random choice.
+
+
+```@example tutorial_1
+# get_value_at searches a binary tree for
+# the leaf node containing some value.
+function get_value_at(x::Float64, node::LeafNode)
+ @assert x >= node.interval.l && x <= node.interval.u
+ return node.value
+end
+
+function get_value_at(x::Float64, node::InternalNode)
+ @assert x >= node.interval.l && x <= node.interval.u
+ if x <= node.left.interval.u
+ get_value_at(x, node.left)
+ else
+ get_value_at(x, node.right)
+ end
+end
+
+# Our full model
+@gen function changepoint_model(xs::Vector{Float64})
+ node = ({:tree} ~ generate_segments(minimum(xs), maximum(xs)))
+ noise = ({:noise} ~ gamma(0.5, 0.5))
+ for (i, x) in enumerate(xs)
+ {(:y, i)} ~ normal(get_value_at(x, node), noise)
+ end
+ return node
+end;
+```
+
+We write a visualization for `changepoint_model` below:
+
+
+```@example tutorial_1
+function render_changepoint_model_trace(trace; show_data=true)
+ xs = Gen.get_args(trace)[1]
+ node = Gen.get_retval(trace)
+ fig = render_segments_trace(trace; xlim=(minimum(xs), maximum(xs)))
+ render_node!(node)
+ if show_data
+ ys = [trace[(:y, i)] for i=1:length(xs)]
+ scatter!(xs, ys, c="gray", label=nothing, alpha=0.3, markersize=3)
+ end
+end;
+```
+
+Finally, we generate some simulated data sets and visualize them on top of the underlying piecewise constant function from which they were generated:
+
+
+```@example tutorial_1
+traces = [Gen.simulate(changepoint_model, (xs_dense,)) for i=1:12]
+grid(render_changepoint_model_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+Notice that the amount of variability around the piecewise constant mean function differs from trace to trace.
+
+Now we perform inference for the simple data set:
+
+
+```@example tutorial_1
+traces = [do_inference(changepoint_model, xs_dense, ys_simple, 10000) for _=1:12];
+grid(render_changepoint_model_trace, traces)
+```
+
+
+
+
+
+
+
+
+
+We see that we inferred that the mean function that explains the data is a constant with very high probability.
+
+For inference about the complex data set, we use more computation. You can experiment with different amounts of computation to see how the quality of the inferences degrades with less computation. Note that we are using a very simple generic inference algorithm in this tutorial, which really isn't suited for this more complex task. In later tutorials, we will learn how to write more efficient algorithms, so that accurate results can be obtained with significantly less computation. We will also see ways of annotating the model for better performance, no matter the inference algorithm.
+
+!!! warning
+ The following cell may run for 2-3 minutes.
+
+
+```@example tutorial_1
+traces = [do_inference(changepoint_model, xs_dense, ys_complex, 100000) for _=1:12];
+grid(render_changepoint_model_trace, traces)
+```
+
+
+
+The results show that more segments are inferred for the more complex data set.
+
+------
+### Exercise
+Write a function that takes a data set of x- and y-coordinates and plots the histogram of the probability distribution on the number of changepoints.
+Show the results for the `ys_simple` and `ys_complex` data sets.
+
+Hint: The return value of `changepoint_model` is the tree of `Node` values. Walk this tree.
+
+-------
+
+### Exercise
+Write a new version of `changepoint_model` that uses `{*} ~ ...` without an address to make the recursive calls.
+
+Hint: You will need to guarantee that all addresses are unique. How can you label each node in a binary tree using an integer?
\ No newline at end of file
diff --git a/docs/src/tutorials/basics/particle_filter.md b/docs/src/tutorials/basics/particle_filter.md
new file mode 100644
index 000000000..4da7ac27d
--- /dev/null
+++ b/docs/src/tutorials/basics/particle_filter.md
@@ -0,0 +1,784 @@
+# [Object Tracking with Particle Filters](@id object_tracking_with_particle_filter_tutorial)
+
+## Introduction
+So far, we've seen two general classes of inference algorithm, importance sampling and MCMC.
+Very informally, and focusing only on one aspect of the algorithms, we might describe them
+as follows:
+
+* *Importance Sampling*: guesses solutions "all at once" using a proposal distribution.
+ That proposal may be "smart" (e.g., a neural network), but still guesses an entire solution
+ in one go. We make many guesses, and weight them according to the importance weighting formula.
+
+* *MCMC*: beginning with an initial guess, iteratively refine the guess to explore the space of
+ possible solutions. At every iteration, the current state is an entire proposed solution to the
+ problem.
+
+In this notebook, we will explore a third paradigm: **Sequential Monte Carlo**. SMC methods,
+such as particle filtering, iteratively solve a *sequence of inference problems* using techniques
+based on importance sampling and in some cases MCMC [^1][^2]. The solution to each problem in the sequence
+is represented as a collection of samples or *particles*. The particles for each problem are based on
+extending or adjusting the particles for the previous problem in the sequence.
+
+The sequence of inference problems that are solved often arise naturally from
+observations that arrive incrementally, as in _particle filtering_. Particle filtering
+algorithms are a subclass of SMC algorithms, often applied to state-space models in
+which we observe an evolving process over time. We begin by only considering the first
+time step, inferring the latent variables at that time step given that time step's
+observations. We then consider a slightly more difficult inference problem: joint
+inference of the first _two_ time steps' latent variables, given both time steps'
+observations. And so on, until the observations stop.
+
+But SMC is a more general algorithm than the particle filter might suggest.
+Sometimes, the sequence of problems does not arise from data arriving incrementally,
+but is rather constructed instrumentally to facilitate inference, as
+in annealed importance sampling [^3].
+
+However, this notebook focuses on particle filtering for a typical tracking problem.
+We show how Gen's support for SMC integrates with its support for MCMC, enabling
+"rejuvenation" MCMC moves. Specifically, we will address the
+"bearings only tracking" problem described in [^4].
+
+This notebook will also introduce you to the
+[`Unfold`](@ref Unfold) combinator,
+which can be used to improve performance of SMC.
+`Unfold` is just one example of the levers that Gen provides for
+improving performance; once you understand it, you can check
+Gen's documentation to see how similar principles apply to the
+[`Map`](https://www.gen.dev/docs/dev/ref/combinators/#Map-combinator-1) combinator
+and to the static DSL. (These features are also covered in the previous tutorial,
+[Scaling with the Static Modeling Language](@ref scaling_with_sml_tutorial))
+
+**Prerequisites for this tutorial**
+- [Generative Combinators](@ref combinators_tutorial)
+- [Static Modeling Language](@ref scaling_with_sml_tutorial) for the second half of the tutorial.
+
+
+[^1]: Doucet, Arnaud, Nando De Freitas, and Neil Gordon. "An introduction to sequential Monte Carlo methods." Sequential Monte Carlo methods in practice. Springer, New York, NY, 2001. 3-14.
+
+[^2]: Del Moral, Pierre, Arnaud Doucet, and Ajay Jasra. "Sequential Monte Carlo samplers." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 68.3 (2006): 411-436.
+
+[^3]: Neal, Radford M. "Annealed importance sampling." Statistics and computing 11.2 (2001): 125-139.
+
+[^4]: Gilks, Walter R., and Carlo Berzuini. "Following a moving target—Monte Carlo inference for dynamic Bayesian models." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 63.1 (2001): 127-146. [PDF](http://www.mathcics.emory.edu/~whalen/Papers/BNs/MonteCarlo-DBNs.pdf)
+
+## Implementing the Generative Model
+
+We will implement a generative model for the movement of a point in the x-y
+plane and bearing measurements of the location of this point relative to the
+origin over time. We imagine, for example, that we are located at the origin,
+and can measure the location of a far-away ship (the object we are tracking) only
+by measuring its _bearing_ relative to us, i.e., the angle formed with the x axis
+by the ray connecting us to the ship. We would like to infer its (x, y) position
+over time.
+
+We assume that we know the approximate initial position and velocity of the
+ship. We assume the point's x- and y- velocity are subject to random
+perturbations drawn from some normal distribution with a known variance. Each
+bearing measurement consists of the angle of the point being tracked relative
+to the positive x-axis.
+
+We write the generative model as a generative function below. The function
+first samples the initial state of the ship from a prior distribution, and
+then generates `T` successive states in a `for` loop. The argument to the
+model (`T`) is the number of time steps not including the initial state.
+
+
+```@example particle_filter_tutorial
+using Gen, Plots
+bearing(x, y) = atan(y, x)
+
+@gen function model(T::Int)
+
+ measurement_noise = 0.005
+ velocity_var = 1e-6
+
+ xs = Vector{Float64}(undef, T+1)
+ ys = Vector{Float64}(undef, T+1)
+
+ # prior on initial x-coordinate
+ x = {:x0} ~ normal(0.01, 0.01)
+
+ # prior on initial y-coordinate
+ y = {:y0} ~ normal(0.95, 0.01)
+
+ # prior on x-component of initial velocity
+ vx = {:vx0} ~ normal(0.002, 0.01)
+
+ # prior on y-component of initial velocity
+ vy = {:vy0} ~ normal(-0.013, 0.01)
+
+ # initial bearing measurement
+ z0 ~ normal(bearing(x, y), measurement_noise)
+
+ # record position
+ xs[1] = x
+ ys[1] = y
+
+ # generate successive states and measurements
+ for t=1:T
+
+ # update the state of the point
+ vx = {(:vx, t)} ~ normal(vx, sqrt(velocity_var))
+ vy = {(:vy, t)} ~ normal(vy, sqrt(velocity_var))
+ x += vx
+ y += vy
+
+ # bearing measurement
+ {(:z, t)} ~ normal(bearing(x, y), measurement_noise)
+
+ # record position
+ xs[t+1] = x
+ ys[t+1] = y
+ end
+
+ # return the sequence of positions
+ return (xs, ys)
+end;
+```
+
+Note that the `model` function itself uses mutation to evolve the variables
+`x`, `y`, `vx`, and `vy` over time. The `{addr} ~ distribution()` syntax keeps
+the names of traced random variables (for which each address may only be used
+once) separate from the names of program variables, like `x`, which may be
+reassigned multiple times during the function's execution.
+
+We generate a data set of positions, and observed bearings, by sampling from
+this model, with `T=50`:
+
+
+```@example particle_filter_tutorial
+import Random
+Random.seed!(3)
+
+# generate trace with specific initial conditions
+T = 50
+constraints = Gen.choicemap((:x0, 0.01), (:y0, 0.95), (:vx0, 0.002), (:vy0, -0.013))
+(trace, _) = Gen.generate(model, (T,), constraints)
+nothing
+```
+
+Let's extract the observed data `zs` from the trace
+```@example particle_filter_tutorial
+choices = Gen.get_choices(trace)
+zs = Vector{Float64}(undef, T+1)
+zs[1] = choices[:z0]
+for t=1:T
+ zs[t+1] = choices[(:z, t)]
+end
+zs
+```
+
+We next write a visualization for full traces of this model. It shows the ship's
+positions (as dots) as well as the observed bearings (as fixed length line
+segments from the origin):
+
+```@example particle_filter_tutorial
+function render(trace; show_data=true, max_T=get_args(trace)[1], overlay=false)
+ (T,) = Gen.get_args(trace)
+ (xs, ys) = Gen.get_retval(trace)
+
+ zs = Vector{Float64}(undef, T+1)
+ zs[1] = trace[:z0]
+ for t=1:T
+ zs[t+1] = trace[(:z, t)]
+ end
+
+ f = overlay ? scatter! : scatter
+ fig = f(xs[1:max_T+1], ys[1:max_T+1], msize=3, msw=1, label=nothing)
+
+ if show_data
+ for z in zs[1:max_T+1]
+ dx = cos(z) * 0.5
+ dy = sin(z) * 0.5
+ plot!([0., dx], [0., dy], color="red", alpha=0.3, label=nothing)
+ end
+ end
+
+ return fig
+end
+```
+
+We visualize the synthetic trace below:
+
+```@example particle_filter_tutorial
+render(trace)
+title!("Observed bearings (lines) and positions (dots)")
+```
+
+Note that these are the observed *bearings*, but we are not plotting the
+"ground truth" *locations* of the ship. There are many trajectories consistent
+with these bearings; for each of the red rays in the above plot, the ship
+could be anywhere along the ray (or even slightly off it, given that our
+measurements are noisy). However, our assumptions about the dynamics of the
+situation — that is, the conditional distributions $P(x_{t+1}, y_{t+1}
+\mid x_t, y_t)$ — will ensure that physics-defying trajectories (e.g.,
+where the ship moves from a very high Y coordinate to a very low one in a
+short time) are ruled out.
+
+## Implementing a Basic Particle Filter
+
+In Gen, a **particle is represented as a trace** and the particle filter
+state contains a weighted collection of traces. Below we define an inference
+program that runs a particle filter on an observed data set of bearings
+(`zs`). We use `num_particles` particles internally, and then we return a
+sample of `num_samples` traces from the weighted collection that the particle
+filter produces.
+
+Gen provides methods for initializing and updating the state of a particle
+filter, documented in [Particle Filtering](https://www.gen.dev/docs/dev/ref/pf/).
+
+- `Gen.initialize_particle_filter`
+
+- `Gen.particle_filter_step!`
+
+Both of these methods can be used either with the default proposal or a custom
+proposal. In this
+problem,
+we will use the default proposal. There is also a method that resamples
+particles based on their weights, which serves to redistribute the particles
+to more promising parts of the latent space.
+
+- `Gen.maybe_resample!`
+
+Gen also provides a method for sampling a collection of unweighted traces
+from the current weighted collection in the particle filter state:
+
+- `Gen.sample_unweighted_traces`
+
+```@example particle_filter_tutorial
+function particle_filter(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+
+ # construct initial observations
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles)
+
+ # steps
+ for t=1:length(zs)-1
+ Gen.maybe_resample!(state, ess_threshold=num_particles/2)
+ obs = Gen.choicemap(((:z, t), zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of unweighted traces from the weighted collection
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+```
+
+The initial state is obtained by providing the following to
+`initialize_particle_filter`:
+
+- The generative function for the generative model (`model`)
+
+- The initial arguments to the generative function.
+
+- The initial observations, expressed as a map from choice address to values
+ (`init_obs`).
+
+- The number of particles.
+
+At each step, we resample from the collection of traces (`maybe_resample!`)
+and then we introduce one additional bearing measurement by calling
+`particle_filter_step!` on the state. We pass the following arguments to
+`particle_filter_step!`:
+
+- The state (it will be mutated)
+
+- The new arguments to the generative function for this step. In our case,
+ this is the number of measurements beyond the first measurement.
+
+- The [argdiff](https://www.gen.dev/docs/dev/ref/gfi/#Argdiffs-1)
+ value, which provides detailed information about the change to the
+ arguments between the previous step and this step. We will revisit this
+ value later. For now, we indicate that we do not know how the `T::Int`
+ argument will change with each step.
+
+- The new observations associated with the new step. In our case, this just
+ contains the latest measurement.
+
+
+We run this particle filter with 5000 particles, and return a sample of 200
+particles. This will take 30-60 seconds. We will see one way of speeding up
+the particle filter in a later section.
+
+```@example particle_filter_tutorial
+@time pf_traces = particle_filter(5000, zs, 200);
+```
+
+To render these traces, we first define a function that overlays many
+renderings:
+
+```@example particle_filter_tutorial
+function overlay(renderer, traces; same_data=true, args...)
+ fig = plot(xlabel="X", ylabel="Y",
+ title="Observed bearings (red) and \npositions of individual traces (one color per trace)")
+
+ renderer(traces[1], show_data=true, overlay=true, args...)
+ for i=2:length(traces)
+ renderer(traces[i], show_data=!same_data, overlay=true, args...)
+ end
+ fig
+end
+```
+
+
+We then render the traces from the particle filter:
+
+```@example particle_filter_tutorial
+overlay(render, pf_traces)
+```
+
+We see a broad posterior; many trajectories (i.e. x- and y-positions) explain
+the observed bearings.
+
+Notice that during the period of denser bearing measurements, the
+trajectories tend to turn so that the heading is more parallel to the bearing
+vector. An alternative explanation is that the point maintained a constant
+heading, but just slowed down significantly. It is interesting to see that the
+inferences favor the "turning explanation" over the "slowing down
+explanation".
+
+!!! note
+ Run the particle filter with fewer particles and visualize the results.
+
+
+ Run the particle filter without the `maybe_resample!` step, and visualize the
+ results. What do you observe? Why do you think this is? Answer in the free
+ response section below.
+
+
+The code for particle_filter (from above) is copied in the body of the function below. Modify it so that it does NOT perform resampling after each
+time step.
+
+```@example particle_filter_tutorial
+function particle_filter_no_resampling(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+
+ # construct initial observations
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles)
+
+ # steps
+ for t=1:length(zs)-1
+ Gen.maybe_resample!(state, ess_threshold=num_particles/2)
+ obs = Gen.choicemap(((:z, t), zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of unweighted traces from the weighted collection
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+
+@time pf_traces_no_resampling = particle_filter_no_resampling(5000, zs, 200);
+overlay(render, pf_traces_no_resampling)
+```
+
+!!! note
+ Describe how the inference differs with and without resampling (based on the two plots above). Why do you think that is? Is it desirable?
+
+```@example particle_filter_tutorial
+function particle_filter_no_resampling(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+
+ # construct initial observations
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles)
+
+ # steps
+ for t=1:length(zs)-1
+ obs = Gen.choicemap(((:z, t), zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of unweighted traces from the weighted collection
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+```
+
+
+
+Without resampling, we get particle collapse, because the model converges on
+one best guess. This is not desirable; we have lost diversity in our
+samples.
+
+
+
+## Adding Rejuvenation Moves
+
+The particle filter we developed above works as follows:
+
+* At the start, guess many possible initial positions and velocities for the
+ ship.
+* Score these proposals based on the initial observation, `z0`.
+* Use `maybe_resample!` to clone the guesses that explain `z0` well, and cull
+ the guesses that explain it poorly.
+* For each data point:
+ 1. For each guess (particle) from the previous time step, guess many possible
+ _extensions_ of the particle to include values of `vx` and `vy` for the next
+ time step.
+ 2. Score these extended proposed particles based on the latest bearing.
+ 3. Use `maybe_resample!` to clone the guesses that explain the `z`'s so far, and
+ cull the guesses that don't.
+
+A problem with this procedure is that after the initial guesses for a quantity
+have been made, they are never revised. This is despite the fact that learning
+about later bearings may tell us a lot about earlier positions. This can be
+especially problematic in the presence of *resampling*: notice how, in the
+above results, the starting locations of all the particles are likely nearly
+identical, even though the paths become more diverse as time goes on. This is
+because "good" particles at the first step were likely cloned and propagated
+through the particle filter, never changing the `x0` and `y0` values.
+
+Therefore, it is sometimes useful to add MCMC moves to particles in a particle
+filter between steps. These MCMC moves are often called "rejuvenation moves"
+[^4]. Each rejuvenation move targets the *current posterior distribution* at
+the given step. For example, when applying the rejuvenation move after
+incorporating 3 observations, our rejuvenation moves have as their stationary
+distribution the conditional distribution on the latent variables, given the
+first three observations.
+
+Rejuvenation moves can target any portion of the latent space. It is common
+for rejuvenation moves to target "global" variables that affect every time
+step (e.g., the initial position of the ship), or a sliding window of _recent_
+variables, e.g., the velocities from the previous five time steps.
+
+In this section, we write two new versions of the particle filter, each of
+which uses Metropolis-Hastings rejuvenation moves to adjust each particle at
+every time step. The first version uses so-called "resimulation MH" to adjust
+the initial choices (`x0`, `y0`, and the initial velocities). This means that
+the proposal distribution for MH is equal to the prior of the generative
+model. The proposed next state under this rejuvenation move is independent of
+the current state. By contrast, the second version we write will use Gaussian
+drift proposals, and therefore we refer to it as "random walk MH." The
+Gaussian drift rejuvenation moves will target a sliding window of recent
+velocities, perturbing them to see if — in light of new data — we
+can find better values for them.
+
+First, the resimulation MH rejuvenation move (this function is the same as the
+previous, but with the addition of a rejuvenation move targeting the initial
+choices of each particle):
+
+```@example particle_filter_tutorial
+function particle_filter_rejuv_resim(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles)
+ for t=1:length(zs)-1
+
+ # apply a rejuvenation move to each particle
+ for i=1:num_particles
+ initial_choices = select(:x0, :y0, :vx0, :vy0)
+ state.traces[i], _ = mh(state.traces[i], initial_choices)
+ end
+
+ Gen.maybe_resample!(state, ess_threshold=num_particles/2)
+ obs = Gen.choicemap(((:z, t), zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of unweighted traces from the weighted collection
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+
+@time pf_rejuv_resim_traces = particle_filter_rejuv_resim(5000, zs, 200);
+overlay(render, pf_rejuv_resim_traces)
+title!("Rejuvenation with resimulation MH on the starting points")
+```
+
+You may notice slightly more variety in the initial state, compared to our first round of particle filtering.
+
+!!! note
+
+ Write a random walk MH rejuvenation move that perturbs the velocity vectors
+ for a block of time steps between `a` and `b` inclusive. In this move, draw
+ the perturbation from a normal distribution with standard deviation `1e-3`.
+ When sampling a new `vx` and `vy` for time step `t` (where `a <= t <= b`),
+ make sure you use the right _address_ --- **you want to use the same address
+ in your proposal as was used in the model.**
+
+ We have provided starter code.
+
+```@example particle_filter_tutorial
+@gen function perturbation_proposal(prev_trace, a::Int, b::Int)
+ (T,) = get_args(prev_trace)
+ speed = Array{Float64}(undef, 2, 1)
+ for t=a:b
+ speed[1] = 0. #
+ speed[2] = 0. #
+ end
+ return speed # Return an array of the most recent [vx, vy] for testing
+end
+
+function perturbation_move(trace, a::Int, b::Int)
+ Gen.metropolis_hastings(trace, perturbation_proposal, (a, b))
+end;
+```
+We add this into our particle filtering inference program below. We apply the
+rejuvenation move to adjust the velocities for the previous 5 time steps.
+
+```@example particle_filter_tutorial
+function particle_filter_rejuv(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(model, (0,), init_obs, num_particles)
+ for t=1:length(zs)-1
+
+ # apply a rejuvenation move to each particle
+ for i=1:num_particles
+ state.traces[i], _ = perturbation_move(state.traces[i], max(1, t-5), t-1)
+ end
+
+ Gen.maybe_resample!(state, ess_threshold=num_particles/2)
+ obs = Gen.choicemap(((:z, t), zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of unweighted traces from the weighted collection
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+```
+
+We run the particle filter with rejuvenation below. This will take a minute
+or two. We will see one way of speeding up the particle filter in a later
+section.
+
+```@example particle_filter_tutorial
+@time pf_rejuv_traces = particle_filter_rejuv(5000, zs, 200);
+```
+
+We render the traces:
+
+```@example particle_filter_tutorial
+overlay(render, pf_rejuv_traces)
+title!("Rejuvenation with random walk MH on recent velocities")
+```
+
+
+
+
+## Speeding Up Inference Using the `Unfold` Combinator
+For the particle filtering algorithms above, within an update step it is only
+necessary to revisit the most recent state (or the most recent 5 states if
+the rejuvenation moves are used) because the initial states are never
+updated, and the contribution of these states to the weight computation
+cancel.
+
+However, each update step of the particle filter inference programs above
+scales *linearly* in the size of the trace because it visits every state when
+computing the weight update. This is because the built-in modeling DSL by
+default always performs an end-to-end execution of the generative function
+body whenever performing a trace update. This allows the built-in modeling
+DSL to be very flexible and to have a simple implementation, at the cost of
+performance. There are several ways of improving performance after one has a
+prototype written in the built-in modeling DSL. One of these is [Generative
+Function Combinators](https://www.gen.dev/docs/dev/ref/combinators/), which make
+the flow of information through the generative process more explicit to Gen,
+and enable asymptotically more efficient inference programs.
+
+To exploit the opportunity for incremental computation, and improve the
+scaling behavior of our particle filter inference programs, we will write a
+new model using a generative function combinator to replace the following
+Julia `for` loop in our model.
+
+```julia
+ # generate successive states and measurements
+ for t=1:T
+
+ # update the state of the point
+ vx = {(:vx, t)} ~ normal(vx, sqrt(velocity_var))
+ vy = {(:vy, t)} ~ normal(vy, sqrt(velocity_var))
+ x += vx
+ y += vy
+
+ # bearing measurement
+ {(:z, t)} ~ normal(bearing(x, y), measurement_noise)
+
+ # record position
+ xs[t+1] = x
+ ys[t+1] = y
+ end
+```
+
+This `for` loop has a very specific pattern of information flow—there is a
+sequence of states (represented by `x`, `y`, `vx`, and `vy`), and each state is
+generated from the previous state. This is exactly the pattern that the
+[Unfold](https://www.gen.dev/docs/dev/ref/combinators/#Unfold-combinator-1)
+generative function combinator is designed to handle.
+
+Below, we re-express the Julia `for` loop over the state sequence using the
+Unfold combinator. Specifically, we define a generative function (`kernel`)
+that takes the previous state as its second argument, and returns the new
+state. The Unfold combinator takes the kernel and returns a new generative
+function (`chain`) that applies kernel repeatedly. Read the Unfold combinator
+documentation for details on the behavior of the resulting generative
+function (`chain`).
+
+
+```@example particle_filter_tutorial
+struct State
+ x::Float64
+ y::Float64
+ vx::Float64
+ vy::Float64
+end
+
+@gen (static) function kernel(t::Int, prev_state::State,
+ velocity_var::Float64, measurement_noise::Float64)
+ vx ~ normal(prev_state.vx, sqrt(velocity_var))
+ vy ~ normal(prev_state.vy, sqrt(velocity_var))
+ x = prev_state.x + vx
+ y = prev_state.y + vy
+ z ~ normal(bearing(x, y), measurement_noise)
+ next_state = State(x, y, vx, vy)
+ return next_state
+end
+
+chain = Gen.Unfold(kernel)
+
+Gen.@load_generated_functions # To allow use of the generative function written in the static modeling language above.
+```
+
+We can understand the behavior of `chain` by getting a trace of it and printing the random choices:
+
+trace = Gen.simulate(chain, (4, State(0., 0., 0., 0.), 0.01, 0.01))
+Gen.get_choices(trace)
+
+We now write a new version of the generative model that invokes `chain` instead of using the Julia `for` loop:
+
+```@example particle_filter_tutorial
+@gen (static) function unfold_model(T::Int)
+
+ # parameters
+ measurement_noise = 0.005
+ velocity_var = 1e-6
+
+ # initial conditions
+ x0 ~ normal(0.01, 0.01)
+ y0 ~ normal(0.95, 0.01)
+ vx0 ~ normal(0.002, 0.01)
+ vy0 ~ normal(-0.013, 0.01)
+
+ # initial measurement
+ z0 ~ normal(bearing(x0, y0), measurement_noise)
+
+ # record initial state
+ init_state = State(x0, y0, vx0, vy0)
+
+ # run `chain` function under address namespace `:chain`, producing a vector of states
+ chain ~ chain(T, init_state, velocity_var, measurement_noise)
+
+ result = (init_state, chain)
+ return result
+end
+
+Gen.@load_generated_functions
+```
+
+Let's generate a trace of this new model program to understand its structure:
+
+```@example particle_filter_tutorial
+(trace, _) = Gen.generate(unfold_model, (4,))
+Gen.get_choices(trace)
+```
+
+We can now run a particle filter on the Unfold model and see a speedup:
+
+```@example particle_filter_tutorial
+function unfold_particle_filter(num_particles::Int, zs::Vector{Float64}, num_samples::Int)
+ init_obs = Gen.choicemap((:z0, zs[1]))
+ state = Gen.initialize_particle_filter(unfold_model, (0,), init_obs, num_particles)
+
+ for t=1:length(zs)-1
+ maybe_resample!(state, ess_threshold=num_particles/2)
+ obs = Gen.choicemap((:chain => t => :z, zs[t+1]))
+ Gen.particle_filter_step!(state, (t,), (UnknownChange(),), obs)
+ end
+
+ # return a sample of traces from the weighted collection:
+ return Gen.sample_unweighted_traces(state, num_samples)
+end
+
+@time unfold_pf_traces = unfold_particle_filter(5000, zs, 200);
+
+function unfold_render(trace; show_data=true, max_T=get_args(trace)[1], overlay=false)
+ (T,) = Gen.get_args(trace)
+ choices = Gen.get_choices(trace)
+ (init_state, states) = Gen.get_retval(trace)
+ xs = Vector{Float64}(undef, T+1)
+ ys = Vector{Float64}(undef, T+1)
+ zs = Vector{Float64}(undef, T+1)
+ xs[1] = init_state.x
+ ys[1] = init_state.y
+ zs[1] = choices[:z0]
+ for t=1:T
+ xs[t+1] = states[t].x
+ ys[t+1] = states[t].y
+ zs[t+1] = choices[:chain => t => :z]
+ end
+ f = overlay ? scatter! : scatter
+ fig = f(xs[1:max_T+1], ys[1:max_T+1], msize=3, msw=1, label=nothing)
+ if show_data
+ for z in zs[1:max_T+1]
+ dx = cos(z) * 0.5
+ dy = sin(z) * 0.5
+ plot!([0., dx], [0., dy], color="red", alpha=0.3, label=nothing)
+ end
+ end
+end
+```
+
+Let's check that the results are reasonable:
+
+```@example particle_filter_tutorial
+overlay(unfold_render, unfold_pf_traces, same_data=true)
+```
+
+We now empirically investigate the scaling behavior of
+1. the inference program that uses the Julia `for` loop (`particle_filter`),
+and
+2. the equivalent inference program that uses `Unfold`
+(`unfold_particle_filter`).
+
+We will use a synthetic long vector of z data, and we will investigate how the running time depends on the number of observations.
+
+```@example particle_filter_tutorial
+fake_zs = rand(1000);
+
+function timing_experiment(num_observations_list::Vector{Int}, num_particles::Int, num_samples::Int)
+ times = Vector{Float64}()
+ times_unfold = Vector{Float64}()
+ for num_observations in num_observations_list
+ println("evaluating inference programs for num_observations: $num_observations")
+ tstart = time_ns()
+ traces = particle_filter(num_particles, fake_zs[1:num_observations], num_samples)
+ push!(times, (time_ns() - tstart) / 1e9)
+
+ tstart = time_ns()
+ traces = unfold_particle_filter(num_particles, fake_zs[1:num_observations], num_samples)
+ push!(times_unfold, (time_ns() - tstart) / 1e9)
+
+ end
+ (times, times_unfold)
+end;
+
+num_observations_list = [1, 3, 10, 30, 50, 100, 150, 200, 500]
+(times, times_unfold) = timing_experiment(num_observations_list, 100, 20);
+```
+
+Notice that the running time of the inference program without unfold appears to be quadratic in the number of observations, whereas the inference program that uses unfold appears to scale linearly:
+
+```@example particle_filter_tutorial
+plot(num_observations_list, times, color="blue",
+ xlabel="# observations", ylabel="running time (sec.)", label="for loop")
+plot!(num_observations_list, times_unfold, color="red", label="unfold")
+```
\ No newline at end of file
diff --git a/docs/src/ref/vi.md b/docs/src/tutorials/basics/vi.md
similarity index 65%
rename from docs/src/ref/vi.md
rename to docs/src/tutorials/basics/vi.md
index 3b638449d..ccc45158d 100644
--- a/docs/src/ref/vi.md
+++ b/docs/src/tutorials/basics/vi.md
@@ -1,18 +1,10 @@
-# Variational Inference
+# [Variational Inference](@id vi_tutorial)
Variational inference involves optimizing the parameters of a variational family to maximize a lower bound on the marginal likelihood called the ELBO.
In Gen, variational families are represented as generative functions, and variational inference typically involves optimizing the trainable parameters of generative functions.
-## Black box variational inference
-There are two procedures in the inference library for performing black box variational inference.
-Each of these procedures can also train the model using stochastic gradient descent, as in a variational autoencoder.
-```@docs
-black_box_vi!
-black_box_vimco!
-```
-
## Reparametrization trick
To use the reparametrization trick to reduce the variance of gradient estimators, users currently need to write two versions of their variational family, one that is reparametrized and one that is not.
Gen does not currently include inference library support for this.
-We plan add add automated support for reparametrization and other variance reduction techniques in the future.
+We plan to add automated support for reparametrization and other variance reduction techniques in the future.
\ No newline at end of file
diff --git a/docs/src/tutorials/data_driven_proposals.md b/docs/src/tutorials/data_driven_proposals.md
new file mode 100644
index 000000000..c346a83ea
--- /dev/null
+++ b/docs/src/tutorials/data_driven_proposals.md
@@ -0,0 +1,1201 @@
+# [Improving Inference using Data-Driven Custom Proposals _(with applications to Inverse Planning)_](@id custom_proposal_tutorial)
+
+In our [introduction to modeling tutorial](@ref introduction_to_modeling_in_gen), we used **importance sampling** for inference: the inference engine _proposed_ many possible explanations for a dataset, and then chose one. Importance sampling can be difficult to scale to more complex problems, because it is essentially "guessing and checking." If we ran importance sampling with 1000 particles, for example, the method would fail unless those 1000 proposed solutions (blind guesses, essentially) contained _something_ close to the true answer. In complex problems, it is difficult
+to "guess" (or "propose") an entire solution all at once.
+
+This tutorial addresses the scalability problem while staying inside
+the importance sampling framework: the inference in this notebook is all importance
+sampling, but with **customized "data-driven" proposals**. Such proposals
+can be used to accelerate Monte Carlo inference, making importance sampling
+feasible for a broader class of models. Data-driven proposals work by incorporating
+information from the observed data set to make better proposals for the
+latent variables in a generative model. Data-driven proposals can be based on heuristics,
+or general-purpose _discriminative_ models, such as neural networks or random forests.
+Many data-driven proposals have **trainable parameters**, which can be learned via gradient
+descent using synthetic data simulated from the generative model itself. This training
+process is sometimes called 'amortized inference' or 'inference compilation'.
+
+Although we focus on using data-driven proposals with importance sampling in this notebook,
+data-driven proposals can also be used with Markov Chain Monte Carlo (MCMC) and
+[sequential Monte Carlo](@ref object_tracking_with_particle_filter_tutorial) (SMC), covered in [other](Iterative%20Inference%20in%20Gen.jl).
+
+### Application to Inverse Planning
+This notebook begins by introducing a probabilistic model for the motion of
+an autonomous agent. The model itself demonstrates an important feature of
+Gen: because it is embedded in Julia, we can use complex, black-box programs
+as sub-routines in our models. The model we develop here uses an _RRT path planner_
+to model the goal-directed motion of the agent.
+
+After developing the model, we set out to improve the efficiency of inference.
+We show that we can improve the efficiency of inference in this model using
+a custom proposal for the destination of the agent.
+
+*Outline*
+
+**Section 1.** [A generative model of an autonomous agent](#model)
+
+**Section 2.** [Writing a data-driven proposal as a generative function](#custom-proposal)
+
+**Section 3.** [Using a data-driven proposal within importance sampling](#using)
+
+**Section 4.** [Training the parameters of a data-driven proposal](#training)
+
+
+```
+using Gen, Distributions
+```
+
+## 1: A generative model of an autonomous agent
+
+We begin by writing a generative probabilistic model of the motion of an
+intelligent agent that is navigating a two-dimensional scene. The model will
+be *algorithmic* --- it will invoke a path planning algorithm implemented in
+regular Julia code to generate the agent's motion plan from its destination
+and its map of the scene.
+
+First, we load some basic geometric primitives for a two-dimensional scene. We
+implemented these already in an auxiliary Julia file:
+
+```
+include("../inverse-planning/geometric_primitives.jl");
+```
+
+The file we loaded gives us the `Point` data type (imported from [Luxor.jl](https://juliagraphics.github.io/Luxor.jl/)), which has fields `x` and `y`.
+
+```
+point = Point(1.0, 2.0)
+println(point.x)
+println(point.y)
+```
+
+The file also defines an `Obstacle` data type, which represents a polygonal
+obstacle in a two-dimensional scene, that is constructed from a list of
+vertices. Here, we construct a square:
+
+```
+obstacle = Obstacle([Point(0.0, 0.0), Point(0.5, 0.0), Point(0.5, 1.0), Point(0.0, 1.0)])
+```
+
+Next, we load the definition of a `Scene` data type that represents a
+two-dimensional scene.
+
+include("../inverse-planning/scene.jl");
+
+The `Scene` data type represents a rectangle on the two-dimensional x-y plane
+with a list of obstacles, which is initially empty.
+
+```
+scene = Scene(xmin=0, xmax=1, ymin=0, ymax=1)
+```
+
+Obstacles can be added to a scene with the `add_obstacle!` function:
+
+```
+add_obstacle!(scene, obstacle);
+```
+
+The file we imported also defines functions that simplify the construction of obstacles:
+
+`make_square(center::Point, size::Float64)` constructs a square-shaped
+obstacle centered at the given point with the given side length:
+
+```
+square_obstacle = make_square(Point(.25, .75), 0.1)
+```
+
+`make_line(vertical::Bool, start::Point, length::Float64,
+thickness::Float64)` constructs an axis-aligned line (either vertical or
+horizontal) with given thickness that extends from a given starting point for
+a certain length:
+
+```
+vertical_wall_obstacle = make_line(false, Point(0.20, 0.40), 0.40, 0.02)
+```
+
+We now construct a scene value that we will use in the rest of the notebook:
+
+scene = Scene(xmin=0, xmax=1, ymin=0, ymax=1)
+add_obstacle!(scene, make_square(Point(0.30, 0.20), 0.1))
+add_obstacle!(scene, make_square(Point(0.83, 0.80), 0.1))
+add_obstacle!(scene, make_square(Point(0.80, 0.40), 0.1))
+horizontal = false
+vertical = true
+wall_thickness = 0.02
+add_obstacle!(scene, make_line(horizontal, Point(0.20, 0.40), 0.40, wall_thickness))
+add_obstacle!(scene, make_line(vertical, Point(0.60, 0.40), 0.40, wall_thickness))
+add_obstacle!(scene, make_line(horizontal, Point(0.60 - 0.15, 0.80), 0.15 + wall_thickness, wall_thickness))
+add_obstacle!(scene, make_line(horizontal, Point(0.20, 0.80), 0.15, wall_thickness))
+add_obstacle!(scene, make_line(vertical, Point(0.20, 0.40), 0.40, wall_thickness));
+
+# We visualize the scene below.
+include("../inverse-planning/viz.jl")
+visualize() do
+ draw_scene(scene)
+end
+
+Next, we load a file that defines a `Path` data type (a sequence of
+`Point`s), and a `plan_path` method, which uses a path planning algorithm
+based on rapidly exploring random tree (RRT, [1]) to find a sequence of
+`Point`s beginning with `start` and ending in `dest` such that the line
+segment between each consecutive pair of points does not intersect any
+obstacles in the scene. The planning algorithm may fail to find a valid path,
+in which case it will return a value of type `Nothing`.
+
+`path::Union{Path,Nothing} = plan_path(start::Point, dest::Point,
+scene::Scene, planner_params::PlannerParams)`
+
+[1] [_Rapidly-exploring random trees: A new tool for path planning._](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.35.1853&rep=rep1&type=pdf)
+# S. M. LaValle. TR 98-11, Computer Science Dept., Iowa State University, October 1998.
+
+include("../inverse-planning/planning.jl");
+
+Let's use `plan_path` to plan a path from the lower-left corner of the scene
+into the interior of the box.
+
+start = Point(0.1, 0.1)
+dest = Point(0.5, 0.5)
+planner_params = PlannerParams(rrt_iters=600, rrt_dt=3.0,
+ refine_iters=3500, refine_std=1.)
+example_path = plan_path(start, dest, scene, planner_params)
+
+We visualize the path below with the function `visualize` (defined in the
+external file we loaded), which will visualize the path in the scene.
+The start location is shown as a blue circle, the
+destination as a red rhombus, and the path in orange. Run the cell above followed by
+the cell below a few times to see the variability in the paths generated by
+`plan_path` for these inputs.
+
+visualize() do
+ draw_trace(Dict(:start => start,
+ :dest => dest,
+ :scene => scene,
+ :path => example_path.points);
+ should_draw_measurements=false)
+end
+
+We also need a model for how the agent moves along its path.
+We will assume that the agent moves along its path at a constant speed. The file
+loaded above also defines a method `walk_path(path, speed, dt, num_ticks)`
+that computes the locations of the agent at a set of timepoints
+(a vector of `Point`s sampled at time intervals of `dt` starting at time `0.`),
+given the path and the speed of the agent:
+
+speed = 1.
+dt = 0.1
+num_ticks = 10;
+example_locations = walk_path(example_path, speed, dt, num_ticks)
+println(example_locations)
+
+# Now, we are prepared to write our generative model for the motion of the agent.
+
+@gen function agent_model(
+ scene::Scene, dt::Float64, num_ticks::Int,
+ planner_params::PlannerParams)
+
+ # sample the start point of the agent from the prior
+ start_x ~ uniform(0, 1)
+ start_y ~ uniform(0, 1)
+ start = Point(start_x, start_y)
+
+ # sample the destination point of the agent from the prior
+ dest_x ~ uniform(0, 1)
+ dest_y ~ uniform(0, 1)
+ dest = Point(dest_x, dest_y)
+
+ # plan a path that avoids obstacles in the scene
+ maybe_path = plan_path(start, dest, scene, planner_params)
+ planning_failed = maybe_path === nothing
+
+ # sample the speed from the prior
+ speed ~ uniform(0.3, 1)
+
+ if planning_failed
+ # path planning failed; assume agent stays at start location indefinitely
+ locations = fill(start, num_ticks)
+ else
+ # path planning succeeded; move along the path at constant speed
+ locations = walk_path(maybe_path, speed, dt, num_ticks)
+ end
+
+ # generate noisy measurements of the agent's location at each time point
+ noise = 0.01
+ for (i, point) in enumerate(locations)
+ x = {:meas => (i, :x)} ~ normal(point.x, noise)
+ y = {:meas => (i, :y)} ~ normal(point.y, noise)
+ end
+
+ return (planning_failed, maybe_path)
+end;
+
+We can now perform a traced execution of `agent_model` and examine the
+random choices it made:
+
+### Exercise
+
+Using `simulate` (or `generate`, without any constraints) to sample a trace,
+get the random choices made by this model. Parameterize the planner using
+`PlannerParams` with the same parameters as above.
+
+
+
+
+Next we explore the assumptions of the model by sampling many traces from the
+generative function and visualizing them. We have created a visualization
+specialized for this generative function in the file we included above.
+It also defined a `visualize_grid` method to plot traces in a grid.
+
++
+Let's visualize several traces of the function, with the start location fixed to
+a point in the lower-left corner:
+constraints = Gen.choicemap()
+constraints[:start_x] = 0.1
+constraints[:start_y] = 0.1
+
+traces = [Gen.generate(
+ agent_model, (scene, dt, num_ticks, planner_params), constraints
+ )[1] for i in 1:12];
+visualize_grid(traces, 4, 600; separators="gray") do trace, frame
+ draw_trace(trace, frame; draw_measurements=true, markersize=6)
+end
+-
+
+In this visualization, the measured coordinates at each
+time point are represented by black $\times$ marks. The path, if path planning was
+succesful, is shown as an orange line from the start point to the destination
+point. Notice that the speed of the agent is different in each case. Also note that
+we observe the agent for a fixed amount of time, in which they may or may not
+finish walking their planned path.
+
+
+
+### Exercise
+
+Edit the constraints passed to the inference algorithm:
+
+1. Constrain the start of the agent to be at $x = 0.9$, $y = 0.1$.
+2. Constrain the destination of the agent to be at $x = 0.9$, $y = 0.8$.
+
+
+Visualize the resulting prior. We have provided some starter code.
+
++
+< put your code here>
+traces_constrained = []
+for i in 1:12
+ # Modify the following line:
+ (trace_constrained, _) = Gen.generate(agent_model, (scene, dt, num_ticks, planner_params))
+ push!(traces_constrained, trace_constrained)
+end
+
+# Visualize:
+visualize_grid(traces_constrained, 4, 600; separators="gray") do trace, frame
+ draw_trace(trace, frame; draw_measurements=true)
+end
+-
+
+
+
+
+### Exercise
+The `rrt_iters` field of `PlannerParams` is the number of iterations of the RRT
+algorithm to use. The `refine_iters` field of `PlannerParams` is the number of
+iterations of path refinement. These parameters affect the distribution on
+paths of the agent. Visualize traces of the `agent_model` with a couple of
+different settings of these two parameters to the path planning algorithm for
+fixed starting point and destination point. Try setting them to smaller
+values. Discuss.
+
+We have provided starter code.
+
+constraints = Gen.choicemap()
+constraints[:start_x] = 0.1
+constraints[:start_y] = 0.1;
+
+Modify the `PlannerParams` in the cell below.
+
+# +
+planner_params = PlannerParams(
+ rrt_iters=300, rrt_dt=3.0, refine_iters=2000, refine_std=1.) # < change this line>
+
+traces = [Gen.generate(agent_model, (scene, dt, num_ticks, planner_params), constraints)[1] for i in 1:12];
+visualize_grid(traces, 4, 600; separators="gray") do trace, frame
+ draw_trace(trace, frame; draw_measurements=true)
+end
+-
+
+
+
+For the next few sections of the notebook, let's reset any variables that may have changed during your exploration with the model.
+
+start = Point(0.1, 0.1)
+dt = 0.1
+num_ticks = 10
+planner_params = PlannerParams(rrt_iters=600, rrt_dt=3.0,
+ refine_iters=3500, refine_std=1.);
+
+We will infer the destination of the agent for the given sequence of observed locations:
+
+measurements = [
+ Point(0.0980245, 0.104775),
+ Point(0.113734, 0.150773),
+ Point(0.100412, 0.195499),
+ Point(0.114794, 0.237386),
+ Point(0.0957668, 0.277711),
+ Point(0.140181, 0.31304),
+ Point(0.124384, 0.356242),
+ Point(0.122272, 0.414463),
+ Point(0.124597, 0.462056),
+ Point(0.126227, 0.498338)];
+
+
+
+### Exercise
+
+Run inference using Gen's built-in importance resampling implementation. Use
+50 importance samples (`amt_computation`).
+
+To see how to use the built-in importance resampling function, run
+```?Gen.importance_resampling``` or check out the
+[documentation](https://www.gen.dev/docs/dev/ref/importance/#Gen.importance_resampling).
+
+We have provided some starter code.
+
+function do_inference(
+ scene::Scene, dt::Float64, num_ticks::Int,
+ planner_params::PlannerParams,
+ start::Point, measurements::Vector{Point}, amount_of_computation::Int)
+
+ # Constrain the observed measurements.
+ observations = Gen.choicemap()
+ observations[:start_x] = start.x
+ observations[:start_y] = start.y
+ for (i, m) in enumerate(measurements)
+ observations[:meas => (i, :x)] = m.x
+ observations[:meas => (i, :y)] = m.y
+ end
+
+ # < put your code here>
+
+ return trace
+end;
+
+
+#### Visualize your answer
+Below, we run this algorithm 500 times, to generate 500 approximate samples
+from the posterior distribution on the destination. The inferred destinations
+should appear as red rhombuses on the map. The following function
+visualizes the paths overlaid on the scene.
+
+function visualize_inference(measurements, scene, start; computation_amt=50, samples=1000)
+ visualize() do
+ for i in 1:samples
+ trace = do_inference(scene, dt, num_ticks, planner_params, start, measurements, computation_amt)
+ draw_trace(trace; draw_measurements=true, draw_path=false)
+ end
+ end
+end;
+
+
+And now we run it! Note that this might take a while.
+
+visualize_inference(
+ measurements, scene, start, computation_amt=100, samples=500)
+
+The algorithm has made reasonable inferences about where the agent was likely
+trying to go.
+
+Note that the above illustration takes a while to produce. This is
+because computing each endpoint requires sampling 100 times from the default proposal (which
+runs the RRT planner). When our models contain more expensive components, like
+the path-planner, the computational demands of inference increase accordingly.
+This motivates us to find more efficient inference algorithms, that will
+require fewer model evaluations to produce good results.
+
+
+
+### Exercise
+
+In this problem, you'll explore the effect of changing the _scene_ on the
+inferences we make about the agent. Below, we've reproduced the code for
+constructing the scene in which we performed inference above. Modify the scene
+so that there is an opening into the "room" along the _bottom_ wall, in
+addition to the already-existing door along top wall. Otherwise, the scene
+should be identical to the one above.
+
+Rerun inference. The results should be qualitatively different from the
+results generated above, even though the observed movements of the agent are
+identical. **Write a one- or two-sentence description of how the results are
+different, and why.** Please address:
+
+1. Why would a _human_ make different inferences about the agent's likely
+ destination in the two different scenes?
+2. To what extent does the _model_ succeed in producing qualitatively
+ different results in the two scenes? Why? (Concretely, why are certain
+ proposals more often rejected by importance sampling in the two-door scene
+ than in the one-door scene?)
+
+# +
+scene_2doors = Scene(xmin=0, xmax=1, ymin=0, ymax=1)
+
+Add the three blocks.
+```
+add_obstacle!(scene_2doors, make_square(Point(0.30, 0.20), 0.1))
+add_obstacle!(scene_2doors, make_square(Point(0.83, 0.80), 0.1))
+add_obstacle!(scene_2doors, make_square(Point(0.80, 0.40), 0.1))
+```
+
+Add the walls. You will need to change this code. In particular, you will need to edit
+one of these lines (the one that constructs the bottom wall of the room) and add one new line
+(because in order to create the "door", you will actually need to represent the bottom wall
+as two separate rectangular obstacles -- as the sample code already does for the top wall).
+```
+horizontal = false
+vertical = true
+wall_thickness = 0.02
+add_obstacle!(scene_2doors, make_line(horizontal, Point(0.20, 0.40), 0.40, wall_thickness))
+add_obstacle!(scene_2doors, make_line(vertical, Point(0.60, 0.40), 0.40, wall_thickness))
+add_obstacle!(scene_2doors, make_line(horizontal, Point(0.60 - 0.15, 0.80), 0.15 + wall_thickness, wall_thickness))
+add_obstacle!(scene_2doors, make_line(horizontal, Point(0.20, 0.80), 0.15, wall_thickness))
+add_obstacle!(scene_2doors, make_line(vertical, Point(0.20, 0.40), 0.40, wall_thickness));
+# -
+```
+
+Perform and visualize inference:
+```
+visualize_inference(measurements, scene_2doors, start, computation_amt=100, samples=500)
+```
+
+**Free response:** What changed about the inferences when you changed the scene,
+and why? You might address:
+
+1. Why would a _human_ make different inferences about the agent's likely
+ destination in the two different scenes?
+2. To what extent does the _model_ succeed in producing qualitatively
+ different results in the two scenes? (Concretely, why are certain proposals
+ more often rejected by importance sampling in the two-door scene than in
+ the one-door scene?)
+
+
+
+
+## 2. Writing a data-driven proposal as a generative function
+
+The inference algorithm above used a variant of
+[`Gen.importance_resampling`](https://www.gen.dev/docs/stable/ref/importance/#Gen.importance_resampling)
+that does not take a custom proposal distribution. It uses the default
+proposal distribution associated with the generative model. For generative
+functions defined using the built-in modeling DSL, the default proposal
+distribution is based on *ancestral sampling*, which involves sampling
+unconstrained random choices from the distributions specified in the
+generative model. Put more simply, each "guess" the inference algorithm
+makes about the possible destination of the agent is totally uninformed
+by the observed measurements; it is sampled using the prior generative
+ model's `dest_x` and `dest_y` sampling statements.
+
+We can visualize this default proposal distribution by sampling from it,
+ using `Gen.generate` (note, we also could use `Gen.simulate` for the same purpose, since we are not passing any constraints). The cell below shows samples of the agent's destination
+ drawn from this distribution.
+
++
+include("../inverse-planning/viz.jl");
+
+traces = [Gen.generate(agent_model, (scene, dt, num_ticks, planner_params))[1] for i in 1:1000]
+visualize() do
+
+ for i in 1:1000
+ trace, = Gen.generate(agent_model, (scene, dt, num_ticks, planner_params))
+ draw_dest(scene, Point(trace[:dest_x], trace[:dest_y]))
+ end
+
+ draw_scene(scene)
+ draw_start(scene, start)
+ draw_measurements(scene, measurements)
+end
+-
+
+Intuitively, if we see the data set above (where blue is the starting
+location, and the measurements are in black), we might guess that the
+agent is more likely to be heading into the upper part of the scene. This
+is because we don't expect the agent to unnecessarily backtrack on its route
+to its destination. A simple heuristic for biasing the proposal distribution
+of the destination using just the first measurement and the last measurement might be:
+
+- If the x-coordinate of the last measurement is greater than the
+ x-coordinate of the first measurement, we think the agent is probably
+ headed generally to the right. Make values `:dest_x` that are greater than
+ the x-coordinate of the last measurement more probable.
+
+- If the x-coordinate of the last measurement is less than the x-coordinate of
+ the first measurement, we think the agent is probably headed generally to
+ the left. Make values `:dest_x` that are smaller than the x-coordinate of
+ the last measurement more probable.
+
+We can apply the same heuristic separately for the y-coordinate.
+
+To implement this idea, we discretize the x-axis and y-axis of the scene into
+bins:
+
+num_x_bins = 5
+num_y_bins = 5;
+
+We will propose the x-coordinate of the destination from a
+[piecewise_uniform](https://www.gen.dev/docs/dev/ref/distributions/#Gen.piecewise_uniform)
+distribution, where we set higher probability for certain bins based on the
+heuristic described above and use a uniform continuous distribution for the
+coordinate within a bin. The `compute_bin_probs` function below computes the
+probability for each bin. The bounds of the scene are given by `min` and
+`max`. The coordinates of the first and last measured points respectively are
+given by `first` and `last`. We compute the probability by assigning a
+"score" to each bin based on the heuristic above --- if the bin should
+receive lower probability, it gets a score of 1., and if it should receive
+higher probability, it gets a score of `score_high`, where `score_high` is some
+value greater than 1.
+
++
+function compute_bin_prob(first::Float64, last::Float64, bin::Int, last_bin::Int, score_high)
+    last >= first && bin >= last_bin && return score_high
+    last < first && bin <= last_bin && return score_high
+    return 1.
+end
+
+function compute_bin_probs(num_bins::Int, min::Float64, max::Float64, first::Float64, last::Float64, score_high)
+    bin_len = (max - min) / num_bins
+    last_bin = Base.min(Int(floor((last - min) / bin_len)) + 1, num_bins)  # offset by `min` (arg shadows Base.min); clamp so last == max lands in the top bin
+    probs = [compute_bin_prob(first, last, bin, last_bin, score_high) for bin=1:num_bins]
+    total = sum(probs)
+    return [p / total for p in probs]
+end;
+-
+
+We will see how to automatically tune the value of `score_high` shortly. For
+now, we will use a value of 5. Below, we see that for the data set of
+measurements shown above, the probabilities of higher bins are indeed 5x
+larger than those of lower bins, because the agent seems to be headed up.
+
+compute_bin_probs(num_y_bins, scene.ymin, scene.ymax, measurements[1].y, measurements[end].y, 5.)
+
+Next, we write a generative function that applies this heuristic for both the
+x-coordinate and y-coordinate, and samples the destination coordinates at
+addresses `:dest_x` and `:dest_y`.
+
+@gen function custom_dest_proposal(measurements::Vector{Point}, scene::Scene)
+
+ score_high = 5.
+
+ x_first = measurements[1].x
+ x_last = measurements[end].x
+ y_first = measurements[1].y
+ y_last = measurements[end].y
+
+ # sample dest_x
+ x_probs = compute_bin_probs(num_x_bins, scene.xmin, scene.xmax, x_first, x_last, score_high)
+ x_bounds = collect(range(scene.xmin, stop=scene.xmax, length=num_x_bins+1))
+ dest_x ~ piecewise_uniform(x_bounds, x_probs)
+
+ # sample dest_y
+ y_probs = compute_bin_probs(num_y_bins, scene.ymin, scene.ymax, y_first, y_last, score_high)
+ y_bounds = collect(range(scene.ymin, stop=scene.ymax, length=num_y_bins+1))
+ dest_y ~ piecewise_uniform(y_bounds, y_probs)
+
+ return nothing
+end;
+
+We can propose values of random choices from the proposal function using
+[`Gen.propose`](https://www.gen.dev/docs/stable/ref/gfi/#Gen.propose).
+This method returns the choices, as well as some other information, which we
+won't need for our purposes. For now, you can think of `Gen.propose` as
+similar to `Gen.generate` except that it does not produce a full execution
+trace (only the choices), and it does not accept constraints. We can see the
+random choices for one run of the proposal on our data set:
+
+(proposed_choices, _, _) = Gen.propose(custom_dest_proposal, (measurements, scene))
+proposed_choices
+
+The function below runs the proposal 1000 times. For each run, it generates a
+trace of the model where the `:dest_x` and `:dest_y` choices are constrained
+to the proposed values, and then visualizes the resulting traces. We make the
+proposal a parameter of the function because we will be visualizing the
+output distribution of various proposals later in the notebook.
+
+function visualize_custom_destination_proposal(measurements, start, dest_proposal; num_samples=100)
+ visualize() do
+ for i=1:num_samples
+ (proposed_choices, _) = Gen.propose(dest_proposal, (measurements, scene))
+ constraints = choicemap(proposed_choices)
+ constraints[:start_x] = start.x
+ constraints[:start_y] = start.y
+ (trace, _) = Gen.generate(agent_model, (scene, dt, num_ticks, planner_params), constraints)
+ draw_dest(scene, Point(trace[:dest_x], trace[:dest_y]))
+ end
+ draw_scene(scene)
+ draw_start(scene, start)
+ draw_measurements(scene, measurements)
+ end
+end;
+
+Let's visualize the output distribution of `custom_dest_proposal` for our
+data set:
+
+visualize_custom_destination_proposal(measurements, start, custom_dest_proposal, num_samples=1000)
+
+We see that the proposal distribution indeed samples destinations in the top
+half of the scene with higher probability than destinations in the bottom
+half.
+
+Alone, this is just a heuristic. But we can use it as a proposal for importance sampling, turning it into an asymptotically valid Bayesian inference algorithm. Alternatively, we can view it as a tool for speeding up our naive importance sampler, by focusing computation on regions of the space that are more likely.
+
+## 3. Using a data-driven proposal within importance sampling
+
+We now use our data-driven proposal within an inference algorithm. There is a
+second variant of
+[`Gen.importance_resampling`](https://www.gen.dev/docs/stable/ref/importance/#Gen.importance_resampling)
+that accepts a generative function representing a custom proposal. This
+proposal generative function makes traced random choices at the addresses of
+a subset of the unobserved random choices made by the generative model. In
+our case, these addresses are `:dest_x` and `:dest_y`.
+
+
+
+### Exercise
+
+Implement an inference program that uses this second variant of importance resampling.
+
+
+Because we will experiment with different data-driven proposals, we make the
+proposal into an argument of our inference program. We assume that the
+proposal accepts arguments `(measurements, scene)`.
+
+This time, use only 5 importance samples (`amt_computation`). You can run
+`?Gen.importance_resampling` or check out the
+[documentation](https://www.gen.dev/docs/stable/ref/inference/#Importance-Sampling-1)
+to understand how to supply the arguments to invoke this second version of
+importance resampling.
+
+We have provided some starter code.
+
+# +
+function do_inference_data_driven(
+ dest_proposal::GenerativeFunction,
+ scene::Scene, dt::Float64,
+ num_ticks::Int, planner_params::PlannerParams,
+ start::Point, measurements::Vector{Point},
+ amount_of_computation::Int)
+
+ observations = Gen.choicemap((:start_x, start.x), (:start_y, start.y))
+ for (i, m) in enumerate(measurements)
+ observations[:meas => (i, :x)] = m.x
+ observations[:meas => (i, :y)] = m.y
+ end
+
+ # trace = ... < put your code here>
+
+ return trace
+end;
+
+function visualize_data_driven_inference(measurements, scene, start, proposal; amt_computation=50, samples=1000)
+ visualize() do
+ for i=1:samples
+ trace = do_inference_data_driven(proposal,
+ scene, dt, num_ticks, planner_params, start, measurements, amt_computation)
+ draw_dest(scene, Point(trace[:dest_x], trace[:dest_y]))
+ end
+ draw_scene(scene)
+ draw_start(scene, start)
+ draw_measurements(scene, measurements)
+ end
+end;
+# -
+
+visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal; amt_computation=5, samples=1000)
+
+The code executes much more quickly than before, because we are only taking five proposal samples to generate each approximate posterior sample.
+
+We compare this to the original algorithm that used the default proposal, for
+the same "amount of computation" of 5.
+
+visualize_inference(measurements, scene, start, computation_amt=5, samples=1000)
+
+We should see that the results are somewhat more accurate using the
+data-driven proposal. In particular, there is less probability mass in the
+lower left corner when using the data-driven proposal.
+
+
+
+
+
+## 4. Training the parameters of a data-driven proposal
+
+Our choice of the `score_high` value of 5. was somewhat arbitrary. To use
+a more informed value, we can make `score_high` into a [*trainable
+parameter*](https://www.gen.dev/docs/dev/ref/gfi/#Trainable-parameters-1)
+of the generative function. Below, we write a new version of the proposal
+function that makes `score_high` trainable. However, the optimization
+algorithms we will use for training work best with *unconstrained* parameters
+(parameters that can take any value on the real line), but `score_high` must
+be positive. Therefore, we introduce an unconstrained trainable parameter
+named `log_score_high`, and use `exp()` to ensure that `score_high` is
+positive:
+
+@gen function custom_dest_proposal_trainable(measurements::Vector{Point}, scene::Scene)
+
+ @param log_score_high::Float64
+
+ x_first = measurements[1].x
+ x_last = measurements[end].x
+ y_first = measurements[1].y
+ y_last = measurements[end].y
+
+ # sample dest_x
+ x_probs = compute_bin_probs(num_x_bins, scene.xmin, scene.xmax, x_first, x_last, exp(log_score_high))
+ x_bounds = collect(range(scene.xmin, stop=scene.xmax, length=num_x_bins+1))
+ dest_x ~ piecewise_uniform(x_bounds, x_probs)
+
+ # sample dest_y
+ y_probs = compute_bin_probs(num_y_bins, scene.ymin, scene.ymax, y_first, y_last, exp(log_score_high))
+ y_bounds = collect(range(scene.ymin, stop=scene.ymax, length=num_y_bins+1))
+ dest_y ~ piecewise_uniform(y_bounds, y_probs)
+
+ return nothing
+end;
+
+We initialize the value of `score_high` to 1. For this value, our custom
+proposal gives a uniform distribution, and is the same as the default
+proposal.
+
+Gen.init_param!(custom_dest_proposal_trainable, :log_score_high, 0.);
+
+Let's visualize the proposed distribution prior to training to confirm that
+it is a uniform distribution.
+
+visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_trainable, num_samples=1000)
+
+Now, we train the generative function. First, we will require a
+data-generator that generates the training data. The data-generator is a
+function of no arguments that returns a tuple of the form `(inputs, constraints)`.
+The `inputs` are the arguments to the generative function
+being trained, and the `constraints` contains the desired values of random
+choices made by the function for those arguments. For the training
+distribution, we will use the distribution induced by the generative model
+(`agent_model`), restricted to cases where planning actually succeeded. When
+planning failed, the agent just stays at the same location for all time, and
+we won't worry about tuning our proposal for that case. The training
+procedure will attempt to maximize the expected conditional log probability
+(density) that the proposal function generates the constrained values,
+when run on the arguments.
+Note that this is an *average case* objective function --- the resulting proposal
+distribution may perform better on some data sets than others.
+
+function data_generator()
+
+ # since these names are used in the global scope, explicitly declare it
+ # local to avoid overwriting the global variable
+ local measurements
+ local choices
+
+ # obtain an execution of the model where planning succeeded
+ done = false
+ while !done
+ (choices, _, retval) = Gen.propose(agent_model, (scene, dt, num_ticks, planner_params))
+ (planning_failed, maybe_path) = retval
+ done = !planning_failed
+ end
+
+ # construct arguments to the proposal function being trained
+ measurements = [Point(choices[:meas => (i, :x)], choices[:meas => (i, :y)]) for i=1:num_ticks]
+ inputs = (measurements, scene)
+
+ # construct constraints for the proposal function being trained
+ constraints = Gen.choicemap()
+ constraints[:dest_x] = choices[:dest_x]
+ constraints[:dest_y] = choices[:dest_y]
+
+ return (inputs, constraints)
+end;
+
+Next, we choose the type of optimization algorithm we will use for training. Gen
+supports a set of gradient-based optimization algorithms (see [Optimizing
+Trainable
+Parameters](https://www.gen.dev/docs/dev/ref/parameter_optimization/#Optimizing-Trainable-Parameters-1)).
+Here we will use gradient descent with a fixed step size of 0.001.
+
+update = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.001), custom_dest_proposal_trainable);
+
+Finally, we use the
+[`Gen.train!`](https://www.gen.dev/docs/stable/ref/inference/#Gen.train!)
+method to actually do the training.
+
+For each epoch, `Gen.train!` makes `epoch_size` calls to the data-generator
+to construct a batch of training data for that epoch. Then, it iteratively
+selects `num_minibatch` subsets of the epoch training data, each of size
+`100`, and applies the update once per minibatch. At the end of the epoch, it
+generates another batch of evaluation data (of size `evaluation_size`) which
+it uses to estimate the objective function (the expected conditional log
+likelihood under the data-generating distribution).
+
+Here, we are running 200 gradient-descent updates, where each update is using
+a gradient estimate obtained from 100 training examples. The method prints
+the estimate of the objective function after each epoch.
+
+@time scores = Gen.train!(custom_dest_proposal_trainable, data_generator, update,
+ num_epoch=200, epoch_size=100, num_minibatch=1, minibatch_size=100, evaluation_size=100, verbose=true);
+
+using Plots
+plot(scores,
+ xlabel="Iterations of stochastic gradient descent",
+ ylabel="Estimate of expected\nconditional log probability density",
+ label=nothing)
+
+We can read out the new value for `score_high`:
+
+println(exp(Gen.get_param(custom_dest_proposal_trainable, :log_score_high)))
+
+We see that the optimal value of the parameter is indeed larger than our
+initial guess. This validates that the heuristic is indeed a useful one. We
+visualize the proposal distribution below:
+
+visualize_custom_destination_proposal(
+ measurements, start, custom_dest_proposal_trainable,
+ num_samples=1000)
+
+We can visualize the results of inference, using this newly trained proposal:
+
+visualize_data_driven_inference(
+ measurements, scene, start, custom_dest_proposal_trainable,
+ amt_computation=5, samples=1000)
+
+visualize_data_driven_inference(
+ measurements, scene, start, custom_dest_proposal_trainable,
+ amt_computation=10, samples=1000)
+
+------------
+
+### Exercise
+
+Can you devise a data-driven proposal for the speed of the agent? If so, would you
+expect it to work equally well on all data sets?
+
+
+
+## 5. Writing and training a deep learning based data-driven proposal
+
+The heuristic data-driven proposal above gave some improvement in efficiency,
+but it was very simple. One way of constructing complex data-driven proposals
+is to parametrize the proposal with a deep neural network or use another
+class of high-capacity machine learning model (e.g. random forest). Here, we
+will will write a data-driven proposal for the destination of the agent that
+uses deep neural networks. In this section, we do everything manually, without
+the aid of neural network libraries. We also provide an [extension to the tutorial](#) that
+shows how to use PyTorch to make this process a lot easier.
+
+First, we define a sigmoid function for the nonlinearity in our networks.
+
+nonlinearity(x) = 1.7159 * tanh.(x * 0.66666);
+
+We will use a deep neural network with two hidden layers that takes as input
+x- and y- coordinates of the first and last measurement (4 values) and
+produces as output a vector of un-normalized probabilities, one for each bin
+of the x-dimension. We will later sample `:dest_x` from this distribution.
+
+function dest_x_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real, y_last::Real)
+ (W1, b1, W2, b2, W3, b3) = nn_params
+ input_layer = [x_first, y_first, x_last, y_last]
+ hidden_layer_1 = nonlinearity(W1 * input_layer .+ b1)
+ hidden_layer_2 = nonlinearity(W2 * hidden_layer_1 .+ b2)
+ output_layer = exp.(W3 * hidden_layer_2 .+ b3)
+ return output_layer
+end;
+
+After sampling the value of `:dest_x`, we will use a second deep neural
+network with the same structure to sample `:dest_y`.
+
+function dest_y_neural_net(nn_params, x_first::Real, y_first::Real, x_last::Real, y_last::Real)#, dest_x::Real)
+ (W1, b1, W2, b2, W3, b3) = nn_params
+ input_layer = [x_first, y_first, x_last, y_last]
+ hidden_layer_1 = nonlinearity(W1 * input_layer .+ b1)
+ hidden_layer_2 = nonlinearity(W2 * hidden_layer_1 .+ b2)
+ output_layer = exp.(W3 * hidden_layer_2 .+ b3)
+ return output_layer
+end;
+
+Now that we have defined our neural networks, we define our new proposal.
+This generative function has a number of parameters.
+
++
+scale_coord(coord, min, max) = (coord / (max - min)) - 0.5
+
+@gen function custom_dest_proposal_neural(measurements::Vector{Point}, scene::Scene)
+    @param x_W1::Matrix{Float64}
+    @param x_b1::Vector{Float64}
+    @param x_W2::Matrix{Float64}
+    @param x_b2::Vector{Float64}
+    @param x_W3::Matrix{Float64}
+    @param x_b3::Vector{Float64}
+
+    @param y_W1::Matrix{Float64}
+    @param y_b1::Vector{Float64}
+    @param y_W2::Matrix{Float64}
+    @param y_b2::Vector{Float64}
+    @param y_W3::Matrix{Float64}
+    @param y_b3::Vector{Float64}
+
+    num_x_bins = length(x_b3)
+    num_y_bins = length(y_b3)
+
+    # scale inputs to be in the range [-0.5, 0.5]
+    x_first = scale_coord(measurements[1].x, scene.xmin, scene.xmax)
+    x_last = scale_coord(measurements[end].x, scene.xmin, scene.xmax)
+    y_first = scale_coord(measurements[1].y, scene.ymin, scene.ymax)
+    y_last = scale_coord(measurements[end].y, scene.ymin, scene.ymax)
+
+    # sample dest_x
+    x_bounds = collect(range(scene.xmin, stop=scene.xmax, length=num_x_bins+1))
+    x_probs = dest_x_neural_net((x_W1, x_b1, x_W2, x_b2, x_W3, x_b3), x_first, y_first, x_last, y_last)
+    dest_x ~ piecewise_uniform(x_bounds, x_probs / sum(x_probs))
+
+    # sample dest_y (bounds must span the scene's y-range, not its x-range)
+    y_bounds = collect(range(scene.ymin, stop=scene.ymax, length=num_y_bins+1))
+    y_probs = dest_y_neural_net((y_W1, y_b1, y_W2, y_b2, y_W3, y_b3), x_first, y_first, x_last, y_last)
+    dest_y ~ piecewise_uniform(y_bounds, y_probs / sum(y_probs))
+
+    return nothing
+end;
+-
+
+We will use 50 hidden units in each of the layers of the two networks:
+
+num_hidden_1 = 50
+num_hidden_2 = 50;
+
+Next, we initialize the parameters:
+
++
+import Random
+Random.seed!(3)
+
+init_weight(shape...) = (1. / sqrt(shape[2])) * randn(shape...)
+
+init_x_W1 = init_weight(num_hidden_1, 4)
+init_x_W2 = init_weight(num_hidden_2, num_hidden_1)
+init_x_W3 = init_weight(num_x_bins, num_hidden_2)
+
+# set parameters for dest_x_neural_net predictor network
+init_param!(custom_dest_proposal_neural, :x_W1, init_x_W1)
+init_param!(custom_dest_proposal_neural, :x_b1, zeros(num_hidden_1))
+init_param!(custom_dest_proposal_neural, :x_W2, init_x_W2)
+init_param!(custom_dest_proposal_neural, :x_b2, zeros(num_hidden_2))
+init_param!(custom_dest_proposal_neural, :x_W3, init_x_W3)
+init_param!(custom_dest_proposal_neural, :x_b3, zeros(num_x_bins))
+
+init_y_W1 = init_weight(num_hidden_1, 4)
+init_y_W2 = init_weight(num_hidden_2, num_hidden_1)
+init_y_W3 = init_weight(num_y_bins, num_hidden_2)
+
+# set parameters for dest_y_neural_net predictor network
+init_param!(custom_dest_proposal_neural, :y_W1, init_y_W1)
+init_param!(custom_dest_proposal_neural, :y_b1, zeros(num_hidden_1))
+init_param!(custom_dest_proposal_neural, :y_W2, init_y_W2)
+init_param!(custom_dest_proposal_neural, :y_b2, zeros(num_hidden_2))
+init_param!(custom_dest_proposal_neural, :y_W3, init_y_W3)
+init_param!(custom_dest_proposal_neural, :y_b3, zeros(num_y_bins));
+-
+
+Now, we visualize the proposal distribution prior to training:
+
+visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_neural, num_samples=1000)
+
+It looks like the initial distribution is roughly uniform, like the default
+proposal.
+
+Now we train the network using stochastic gradient descent with a fixed step size
+of 0.001 that is shared among all of the trainable parameters.
+
+update = Gen.ParamUpdate(Gen.FixedStepGradientDescent(0.001), custom_dest_proposal_neural);
+
+We use 50 epochs of training. In each epoch, we generate 100 training
+examples, and we apply 100 gradient updates, where each update is based on
+the gradient estimate obtained from a random set of 100 of the training
+examples. At the end of each epoch, we estimate the objective function value
+using 1000 freshly sampled examples. This process takes about 10 minutes to
+run on a typical laptop CPU, so we have precomputed the results for you.
+
+```julia
+using JLD2
+@time scores = Gen.train!(custom_dest_proposal_neural, data_generator, update,
+ num_epoch=50, epoch_size=100, num_minibatch=100, minibatch_size=100,
+ evaluation_size=1000, verbose=true);
+
+let data = Dict()
+ for name in [:x_W1, :x_b1, :x_W2, :x_b2, :x_W3, :x_b3, :y_W1, :y_b1, :y_W2, :y_b2, :y_W3, :y_b3]
+ data[(:param, name)] = Gen.get_param(custom_dest_proposal_neural, name)
+ end
+ data[:scores] = scores
+ save("params/custom_dest_proposal_neural_trained.jld2", "data", data)
+end
+```
+
+We load the results here:
+
+```
+using JLD2
+scores = let data = JLD2.load("params/custom_dest_proposal_neural_trained.jld2", "data")
+ for name in [:x_W1, :x_b1, :x_W2, :x_b2, :x_W3, :x_b3, :y_W1, :y_b1, :y_W2, :y_b2, :y_W3, :y_b3]
+ Gen.init_param!(custom_dest_proposal_neural, name, data[(:param, name)])
+ end
+ data[:scores]
+end;
+```
+
+We plot the estimate of the objective function over epochs:
+
+plot(scores,
+ xlabel="Epochs",
+ ylabel="Estimate of expected\nconditional log probability density",
+ label=nothing)
+
+Below, we visualize the trained proposal distribution for our data set:
+
+visualize_custom_destination_proposal(measurements, start, custom_dest_proposal_neural, num_samples=1000)
+
+If we run inference with `amt_computation` set to 5, we see that the inferred distribution reflects the bias of the proposal:
+
+visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal_neural,
+ amt_computation=5, samples=1000)
+
+As we increase the amount of computation, the effect of the proposal's bias
+is reduced:
+
+visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal_neural,
+ amt_computation=50, samples=1000)
+
+This bias-correction is more noticeable the more computation we use (though here we only draw 100 approximate posterior samples):
+
+visualize_data_driven_inference(measurements, scene, start, custom_dest_proposal_neural,
+ amt_computation=1000, samples=100)
+
+This example highlights an important aspect of importance sampling: it not
+only _upweights_ guesses that explain the data well; it also _downweights_ guesses
+that are too high-probability under the proposal distribution. That is, if a proposal
+is heavily biased toward one region of the state space, all guesses in that region will
+be downweighted accordingly. That's why, even though (a) guesses in the left and right halves
+of the room are equally likely, and (b) the
+proposal strongly prefers the left half of the room, the importance sampling algorithm
+samples roughly the same number of points in each half of the room.
+
+In the limit of infinite computation, the distribution induced by importance sampling
+converges to the true posterior, independent of the proposal. Indeed, using the
+generic proposal with a high amount of computation produces very similar results:
+
+visualize_inference(measurements, scene, start; computation_amt=1000, samples=100)
\ No newline at end of file
diff --git a/docs/src/tutorials/model_optimizations/scaling_with_sml.md b/docs/src/tutorials/model_optimizations/scaling_with_sml.md
new file mode 100644
index 000000000..29ea8f438
--- /dev/null
+++ b/docs/src/tutorials/model_optimizations/scaling_with_sml.md
@@ -0,0 +1,395 @@
+# [Scaling with the Static Modeling Language](@id scaling_with_sml_tutorial)
+
+## Introduction
+For prototyping models and working with dynamic structures, Gen's [Dynamic Modeling Language](@ref dynamic_modeling_language) is a great (and the default) way of writing probabilistic programs in nearly pure Julia. However, better performance and scaling characteristics can be obtained using specialized modeling languages or modeling constructs. This notebook introduces a more specialized modeling language known as the [Static Modeling Language](@ref sml) (SML) which is also built into Gen. The SML provides model speedups by carefully analyzing what work is necessary during inference.
+
+**Prerequisites for this tutorial**
+- [Generative Combinators](@ref combinators_tutorial)
+
+This tutorial will take the robust regression model used to introduce iterative inference in an earlier tutorial and optimize the speed of inference using the SML.
+
+## Slow Inference Program Case Study
+
+```@example sml_tutorial
+using Gen
+using Plots
+
+@gen function model(xs::Vector{Float64})
+ slope ~ normal(0, 2)
+ intercept ~ normal(0, 2)
+ noise ~ gamma(1, 1)
+ prob_outlier ~ uniform(0, 1)
+
+ n = length(xs)
+ ys = Vector{Float64}(undef, n)
+
+ for i = 1:n
+ if ({:data => i => :is_outlier} ~ bernoulli(prob_outlier))
+ (mu, std) = (0., 10.)
+ else
+ (mu, std) = (xs[i] * slope + intercept, noise)
+ end
+ ys[i] = {:data => i => :y} ~ normal(mu, std)
+ end
+ ys
+end
+```
+
+We wrote a Markov chain Monte Carlo inference update for this model that performs updates on each of the 'global' parameters (noise, slope, intercept, and prob_outlier), as well as the 'local' `is_outlier` variable associated with each data point. The update takes a trace as input, and returns the new trace as output. We reproduce this here:
+
+```@example sml_tutorial
+function block_resimulation_update(tr)
+
+ # Block 1: Update the line's parameters
+ line_params = select(:noise, :slope, :intercept)
+ (tr, _) = mh(tr, line_params)
+
+ # Blocks 2-N+1: Update the outlier classifications
+ (xs,) = get_args(tr)
+ n = length(xs)
+ for i=1:n
+ (tr, _) = mh(tr, select(:data => i => :is_outlier))
+ end
+
+ # Block N+2: Update the prob_outlier parameter
+ (tr, _) = mh(tr, select(:prob_outlier))
+
+ # Return the updated trace
+ tr
+end
+```
+
+We write a helper function that takes a vector of y-coordinates and populates a constraints choice map:
+
+```@example sml_tutorial
+function make_constraints(ys::Vector{Float64})
+ constraints = choicemap()
+ for i=1:length(ys)
+ constraints[:data => i => :y] = ys[i]
+ end
+ constraints
+end
+```
+
+Finally, we package this into an inference program that takes the data set of all x- and y-coordinates, and returns a trace. We will be experimenting with different variants of the model, so we make the model an argument to this function:
+
+```@example sml_tutorial
+function block_resimulation_inference(model, xs, ys)
+ observations = make_constraints(ys)
+ (tr, _) = generate(model, (xs,), observations)
+ for iter=1:500
+ tr = block_resimulation_update(tr)
+ end
+ tr
+end
+```
+
+Let's see how the running time of this inference program changes as we increase the number of data points. We don't expect the running time to depend too much on the actual values of the data points, so we just construct a random data set for each run:
+
+```@example sml_tutorial
+ns = [1, 3, 7, 10, 30, 70, 100]
+times = []
+let # hide
+for n in ns
+ xs = rand(n)
+ ys = rand(n)
+ start = time_ns()
+ tr = block_resimulation_inference(model, xs, ys)
+ push!(times, (time_ns() - start) / 1e9)
+end
+end # hide
+nothing
+```
+
+We now plot the running time versus the number of data points:
+
+```@example sml_tutorial
+
+plot(ns, times, xlabel="number of data points", ylabel="running time (seconds)", label=nothing)
+```
+
+The inference program seems to scale quadratically in the number of data points.
+
+To understand why, consider the block of code inside `block_resimulation_update` that loops over the data points:
+
+```julia
+# Blocks 2-N+1: Update the outlier classifications
+(xs,) = get_args(tr)
+n = length(xs)
+for i=1:n
+ (tr, _) = mh(tr, select(:data => i => :is_outlier))
+end
+```
+
+The reason for the quadratic scaling is that the running time of the call to `mh` inside this loop also grows in proportion to the number of data points. This is because the updates to a trace of a model written in the generic built-in modeling language always involve re-running **the entire** model generative function.
+
+However, it should be possible for the algorithm to scale linearly in the number of data points. Briefly, deciding whether to update a given `is_outlier` variable can be done without referencing the other data points. This is because each `is_outlier` variable is conditionally independent of the outlier variables and y-coordinates of the other data points, conditioned on the parameters.
+
+We can make this conditional independence structure explicit using the [Map generative function combinator](https://www.gen.dev/docs/stable/ref/combinators/#Map-combinator-1). Combinators like map encapsulate common modeling patterns (e.g., a loop in which each iteration is making independent choices), and when you use them, Gen can take advantage of the restrictions they enforce to implement performance optimizations automatically during inference. The `Map` combinator, like the `map` function in a functional programming language, helps to execute the same generative code repeatedly.
+
+## Rewriting the Program with Combinators
+
+To use the map combinator to express the conditional independences in our model, we first write a generative function to generate the `is_outlier` variable and the y-coordinate for a single data point:
+
+```@example sml_tutorial
+@gen function generate_single_point(x::Float64, prob_outlier::Float64, noise::Float64,
+ slope::Float64, intercept::Float64)
+ is_outlier ~ bernoulli(prob_outlier)
+ mu = is_outlier ? 0. : x * slope + intercept
+ std = is_outlier ? 10. : noise
+ y ~ normal(mu, std)
+ return y
+end;
+```
+
+We then apply the [`Map`](https://www.gen.dev/docs/stable/ref/combinators/#Map-combinator-1), which is a Julia function, to this generative function, to obtain a new generative function:
+
+```@example sml_tutorial
+generate_all_points = Map(generate_single_point);
+```
+
+This new generative function has one argument for each argument of `generate_single_point`, except that these arguments are now vector-valued instead of scalar-valued. We can run the generative function on some fake data to test this:
+
+```@example sml_tutorial
+xs = Float64[0, 1, 2, 3, 4]
+prob_outliers = fill(0.5, 5)
+noises = fill(0.2, 5)
+slopes = fill(0.7, 5)
+intercepts = fill(-2.0, 5)
+trace = simulate(generate_all_points, (xs, prob_outliers, noises, slopes, intercepts));
+```
+
+We see that the `generate_all_points` function has traced 5 calls to `generate_single_point`, under namespaces `1` through `5`. The `Map` combinator automatically adds these indices to the trace address.
+
+```@example sml_tutorial
+get_choices(trace)
+```
+
+Now, let's replace the Julia `for` loop in our model with a call to this new function:
+
+```@example sml_tutorial
+@gen function model_with_map(xs::Vector{Float64})
+ slope ~ normal(0, 2)
+ intercept ~ normal(0, 2)
+ noise ~ gamma(1, 1)
+ prob_outlier ~ uniform(0, 1)
+ n = length(xs)
+ data ~ generate_all_points(xs, fill(prob_outlier, n), fill(noise, n), fill(slope, n), fill(intercept, n))
+ return data
+end;
+```
+
+Note that this new model has the same address structure as our original model had, so our inference code will not need to change. For example, the 5th data point's $y$ coordinate will be stored at the address `:data => 5 => :y`, just as before. (The `:data` comes from our `data ~ ...` invocation in the `model_with_map` definition, and the `:y` comes from `generate_single_point`; only the `5` has been inserted automatically by `Map`.)
+
+```@example sml_tutorial
+trace = simulate(model_with_map, (xs,));
+get_choices(trace)
+```
+
+Let's test the running time of the inference program, applied to this new model:
+
+```@example sml_tutorial
+with_map_times = []
+for n in ns
+ xs = rand(n)
+ ys = rand(n)
+ start = time_ns()
+ tr = block_resimulation_inference(model_with_map, xs, ys)
+ push!(with_map_times, (time_ns() - start) / 1e9)
+end
+```
+
+We plot the results and compare them to the original model, which used the Julia `for` loop:
+
+```@example sml_tutorial
+plot(ns, times, label="original", xlabel="number of data points", ylabel="running time (seconds)")
+plot!(ns, with_map_times, label="with map")
+```
+
+We see that the quadratic scaling did not improve. In fact, what happened was a constant factor **slowdown**.
+
+We can understand why we still have quadratic scaling, by examining the call to `generate_single_point`:
+
+```julia
+data ~ generate_all_points(xs, fill(prob_outlier, n), fill(noise, n), fill(slope, n), fill(intercept, n))
+```
+
+Even though the function `generate_all_points` knows that each of the calls to `generate_single_point` is conditionally independent, and even though it knows that each update to `is_outlier` only involves a single application of `generate_single_point`, it does not know that **none of its arguments change** within an update to `is_outlier`. Therefore, it needs to visit each call to `generate_single_point`. The generic built-in modeling language does not provide this information to the generative functions that it invokes.
+
+## Rewriting in the Static Modeling Language
+
+In order to provide `generate_all_points` with the knowledge that its arguments do not change during an update to the `is_outlier` variable, we need to write the top-level model generative function that calls `generate_all_points` in the [Static Modeling Language](https://www.gen.dev/docs/stable/ref/modeling/#Static-Modeling-Language-1), which is a restricted variant of the built-in modeling language that uses static analysis of the computation graph to generate specialized trace data structures and specialized implementations of trace operations. We indicate that a function is to be interpreted using the static language using the `static` annotation:
+
+```@example sml_tutorial
+@gen (static) function static_model_with_map(xs::Vector{Float64})
+ slope ~ normal(0, 2)
+ intercept ~ normal(0, 2)
+ noise ~ gamma(1, 1)
+ prob_outlier ~ uniform(0, 1)
+ n = length(xs)
+ data ~ generate_all_points(xs, fill(prob_outlier, n), fill(noise, n), fill(slope, n), fill(intercept, n))
+ return data
+end
+```
+
+The static language has a number of restrictions that make it more amenable to static analysis than the unrestricted modeling language. For example, we cannot use Julia `for` loops, and the return value needs to explicitly use the `return` keyword, followed by a symbol (e.g. `data`). Also, each symbol used on the left-hand side of an assignment must be unique. A more complete list of restrictions is given in the documentation.
+
+Below, we show the static dependency graph that Gen builds for this function. Arguments are shown as diamonds, Julia computations are shown as squares, random choices are shown as circles, and calls to other generative function are shown as stars. The call that produces the return value of the function is shaded in blue.
+
+
+
+Now, consider the update to the `is_outlier` variable:
+```julia
+(tr, _) = mh(tr, select(:data => i => :is_outlier))
+```
+
+Because this update only causes values under address `:data` to change, the `static_model_with_map` function can use the graph above to infer that none of the arguments to `generate_all_points` could have possibly changed. This will allow us to obtain the linear scaling we expected.
+
+However, before we can use a function written in the static modeling language, we need to run the following function (this is required for technical reasons, because functions written in the static modeling language use a staged programming feature of Julia called *generated functions*).
+
+```
+Gen.@load_generated_functions
+```
+
+Finally, we can re-run the experiment with our model that combines the map combinator with the static language:
+
+```@example sml_tutorial
+static_with_map_times = []
+for n in ns
+ xs = rand(n)
+ ys = rand(n)
+ start = time_ns()
+ tr = block_resimulation_inference(static_model_with_map, xs, ys)
+ push!(static_with_map_times, (time_ns() - start) / 1e9)
+end
+nothing
+```
+
+We compare the results to the results for the earlier models:
+
+```@example sml_tutorial
+plot(ns, times, label="original", xlabel="number of data points", ylabel="running time (seconds)")
+plot!(ns, with_map_times, label="with map")
+plot!(ns, static_with_map_times, label="with map and static outer fn")
+```
+
+We see that we now have the linear running time that we expected.
+
+## Benchmarking the Performance Gain
+
+**Note:** *the following section was drafted using an earlier version of Julia. As of Julia 1.7, the dynamic modeling language is fast enough in some cases that you may not see constant-factor performance gains by switching simple dynamic models, with few choices and no control flow, to use the static modeling language. Based on the experiment below, this model falls into that category.*
+
+Note that in our latest model above, `generate_single_point` was still written in the dynamic modeling language. It is not necessary to write `generate_single_point` in the static language, but doing so might provide modest constant-factor performance improvements. Here we rewrite this function in the static language. The static modeling language does not support `if` statements, but does support ternary expressions (`a ? b : c`):
+
+```@example sml_tutorial
+@gen (static) function static_generate_single_point(x::Float64, prob_outlier::Float64, noise::Float64,
+ slope::Float64, intercept::Float64)
+ is_outlier ~ bernoulli(prob_outlier)
+ mu = is_outlier ? 0. : x * slope + intercept
+ std = is_outlier ? 10. : noise
+ y ~ normal(mu, std)
+ return y
+end;
+
+static_generate_all_points = Map(static_generate_single_point);
+
+@gen (static) function fully_static_model_with_map(xs::Vector{Float64})
+ slope ~ normal(0, 2)
+ intercept ~ normal(0, 2)
+ noise ~ gamma(1, 1)
+ prob_outlier ~ uniform(0, 1)
+ n = length(xs)
+ data ~ static_generate_all_points(xs, fill(prob_outlier, n), fill(noise, n), fill(slope, n), fill(intercept, n))
+ return data
+end;
+
+Gen.@load_generated_functions
+```
+
+Now, we re-run the experiment with our new model:
+
+```@example sml_tutorial
+fully_static_with_map_times = []
+let # hide
+for n in ns
+ xs = rand(n)
+ ys = rand(n)
+ start = time_ns()
+ tr = block_resimulation_inference(fully_static_model_with_map, xs, ys)
+ push!(fully_static_with_map_times, (time_ns() - start) / 1e9)
+end
+end # hide
+```
+
+In earlier versions of Julia, we saw a modest improvement in running time, but here (running Julia 1.7.1) we see it makes little to no difference:
+
+```@example sml_tutorial
+plot(ns, times, label="original", xlabel="number of data points", ylabel="running time (seconds)")
+plot!(ns, with_map_times, label="with map")
+plot!(ns, static_with_map_times, label="with map and static outer fn")
+plot!(ns, fully_static_with_map_times, label="with map and static outer and inner fns")
+```
+
+# Checking the Inference Programs
+
+Before wrapping up, let's confirm that all of our models are giving good results:
+
+Let's use a synthetic data set:
+
+```@example sml_tutorial
+true_inlier_noise = 0.5
+true_outlier_noise = 10.
+prob_outlier = 0.1
+true_slope = -1
+true_intercept = 2
+xs = collect(range(-5, stop=5, length=50))
+ys = Float64[]
+for (i, x) in enumerate(xs)
+ if rand() < prob_outlier
+ y = 0. + randn() * true_outlier_noise
+ else
+ y = true_slope * x + true_intercept + randn() * true_inlier_noise
+ end
+ push!(ys, y)
+end
+ys[end-3] = 14
+ys[end-5] = 13;
+
+scatter(xs, ys, xlim=(-7,7), ylim=(-7,15), label=nothing)
+```
+
+We write a trace rendering function that shows the inferred line on top of the observed data set:
+
+```@example sml_tutorial
+function render_trace(trace, title)
+ xs, = get_args(trace)
+ xlim = [-5, 5]
+ slope = trace[:slope]
+ intercept = trace[:intercept]
+ plot(xlim, slope * xlim .+ intercept, color="black", xlim=(-7,7), ylim=(-7,15), title=title, label=nothing)
+ ys = [trace[:data => i => :y] for i=1:length(xs)]
+ scatter!(xs, ys, label=nothing)
+end;
+```
+
+Finally, we run the experiment. We will visualize just one trace produced by applying our inference program to each of the four variants of our model:
+
+
+```@example sml_tutorial
+tr = block_resimulation_inference(model, xs, ys)
+fig1 = render_trace(tr, "model")
+
+tr = block_resimulation_inference(model_with_map, xs, ys)
+fig2 = render_trace(tr, "model with map")
+
+tr = block_resimulation_inference(static_model_with_map, xs, ys)
+fig3 = render_trace(tr, "static model with map")
+
+tr = block_resimulation_inference(fully_static_model_with_map, xs, ys)
+fig4 = render_trace(tr, "fully static model with map")
+
+plot(fig1, fig2, fig3, fig4)
+```
+
+
+It looks like inference in all the models is working reasonably.
\ No newline at end of file
diff --git a/docs/src/tutorials/reversible_jump.md b/docs/src/tutorials/reversible_jump.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/src/ref/trace_translators.md b/docs/src/tutorials/trace_translators.md
similarity index 98%
rename from docs/src/ref/trace_translators.md
rename to docs/src/tutorials/trace_translators.md
index cec0cf963..47d00dd1c 100644
--- a/docs/src/ref/trace_translators.md
+++ b/docs/src/tutorials/trace_translators.md
@@ -1,6 +1,6 @@
# Trace Translators
-While [Generative Functions](@ref) define probability distributions on traces, **Trace Translators** convert from one space of traces to another space of traces.
+While [generative functions](@ref gfi_api) define probability distributions on traces, **Trace Translators** convert from one space of traces to another space of traces.
Trace translators are building blocks of inference programs that utilize multiple model representations, like [Involutive MCMC](@ref).
Trace translators are significantly more general than [Bijectors](https://www.tensorflow.org/probability/api_docs/python/tfp/bijectors/Bijector).
@@ -433,19 +433,3 @@ Tips for defining valid transforms:
- If you find yourself copying the same continuous source address to multiple locations, it probably means your transform is not valid (the Jacobian matrix will have rows that are identical, and so the Jacobian determinant will be zero).
- You can gain some confidence that your transform is valid by enabling dynamic checks (`check=true`) in the trace translator that uses it.
-
-## API
-
-```@docs
-@transform
-@read
-@write
-@copy
-pair_bijections!
-is_involution!
-inverse
-DeterministicTraceTranslator
-GeneralTraceTranslator
-SimpleExtendingTraceTranslator
-SymmetricTraceTranslator
-```
diff --git a/docs/tex/mcmc.tex b/docs/tex/mcmc.tex
deleted file mode 100644
index e381f9b62..000000000
--- a/docs/tex/mcmc.tex
+++ /dev/null
@@ -1,208 +0,0 @@
-\documentclass{article}
-\usepackage{geometry}
-\usepackage[utf8]{inputenc}
-\usepackage{amssymb}
-\usepackage{amsmath}
-\usepackage{amsthm}
-\usepackage{url}
-
-% bibliography
-\PassOptionsToPackage{numbers, compress}{natbib}
-\usepackage{natbib}
-
-\newcommand{\reals}{\mathbb{R}}
-\newcommand{\normal}{\mathcal{N}}
-\newcommand{\pathx}{x}
-\newcommand{\pathy}{y}
-\newcommand{\measx}{\tilde{x}}
-\newcommand{\measy}{\tilde{y}}
-
-\newtheorem{prop}{Proposition}
-
-\title{Efficient Automated Calculation of the Jacobian Determinant for Reversible Jump MCMC}
-\author{Marco Cusumano-Towner}
-\date{}
-
-\begin{document}
-
-\maketitle
-
-\section{Introduction}
-Reversible jump MCMC~\citep{green1995reversible} (RJMCMC) is a general framework for constructing Markov chain Monte Carlo kernels that satisfy detailed balance, while permitting transdimensional and/or deterministic moves.
-Some generic transdimensional MCMC kernels that are implemented by various probabilistic programming systems, including `single-site resimulation Metropolis-Hastings' ~\citep{goodman2008church,wingate2011lightweight} can be justified as instances of RJMCMC.
-However, it is possible to construct much more efficient MCMC kernels that are specialized to a model using the RJMCMC framework.
-The Gen~\citep{cusumano2019gen} probabilistic programming system allows users to program their own custom RJMCMC kernels that are specialized to the model, by specifying an auxiliary generative function called a \emph{proposal} and an \emph{involution} (a function that is its own inverse) that maps tuples of the form $(t, u)$ to tuples of the form $(t', u')$ where $t$ and $t'$ are traces of the model generative function, and $u$ and $u'$ are traces of the proposal generative function.
-In general, both $t$ and $u$ (and by necessity $t'$ and $u'$) may contain the values of continuous random choices, in which case the involution must be constructed from a family of differentiable bijections on the values of these continuous random choices.
-When applying a particular RJMCMC kernel, the determinant of the Jacobian of one of these bijections is required for the acceptance probability calculation.
-This document describes how to implement this Jacobian determinant computation efficiently using automatic differentiation, while exploiting the sparsity of the Jacobian.
-The approach uses a domain-specific language (DSL) for specifying families of bijections that makes the sparsity structure of the Jacobian explicit.
-
-\section{Example}
-
-We consider a variant of the changepoint model that was used to introduce RJMCMC in~\citep{green1995reversible}.
-Briefly, we put a prior on a class of piecewise constant functions on an interval $[0, x_{max}]$ where the number of changepoints is random and Poisson distributed.
-Let $n \in \{1, 2, \ldots, \}$ denote the number of segments, so that the number of changepoints is $n-1 \in \{0, 1, 2, \ldots\}$.
-Let $x_i$ denote changepoint $i$, for $i=1,\ldots,n-1$.
-Then, segment $1$ has range $[0, x_1)$, segment $d$ has range $[x_{n-1}, x_{max}]$ and segment $i$ has range $[x_{i-1}, x_i)$ for $i=2,\ldots,n-1$.
-Let $r_i \in (0, \infty)$ denote the rate for segment $i$ for $i=1,\ldots,n$.
-%There is some likelihood function $l_{\mathcal{D}}(n, \mathbf{x}, \mathbf{r})$ where $\mathcal{D}$ denotes some observed data.
-
-Suppose we are performing a birth-death move as described in~\citep{green1995reversible}, in which we either insert a new changepoint or remove an existing changepoint, and adjust the rates.
-Let $(n, \mathbf{r}, \mathbf{x})$ denote the the previous hypothesis (corresponding to the previous \emph{trace} of the generative function representing the model; we can omit the observations $\mathcal{D}$ from the trace for to simplify notation because they are held fixed).
-The proposal generates the following random choices, dependent on the given trace.
-If $n = 1$, then only a birth move is possible, because there are no changepoints to remove.
-If $n > 1$, then either a birth or death move is possible and we randomly pick sample $b \in \{0, 1\}$, where $b = 1$ indicates a birth move, from a Bernooulli distribution.
-If a birth move is chosen, then the proposal samples an existing segment $i \in \{1, \ldots, n\}$ that will be split into two by the new changepoint from a discrete distribution.
-The proposal then samples a value $u_1 \in (x_{i-1}, x_i)$ for the changepoint from a uniform continuous distribution (where $x_{i-1}$ and $x_i$ are the bounds of segment $i$ in the previous trace); this variable represents the locaion of the newly proposed changepoint.
-The proposal also samples a value $u_2 \in (0, 1)$ that will be used to determine the new rates $r_{i}'$ and $r_{i+1}'$ from the previous rate $r_i$ and $x_i$ and $u_1$ and $x_{i+1}$ (as described in~\citep{green1995reversible} the new rates are determined such that their harmonic mean, weighted by the size of the segment, is equal to the previous rate).
-If a death move is chosen, then the proposal samples an existing changepoint $i \in \{1, \ldots, n-1\}$ to remove.
-
-Suppose the current trace contains $n > 1$ segments, and we are proposing a birth move that is a transition from $n$ segments to $n+1$ segments.
-Then, $\mathbf{r} \in \mathbb{R}^n$ and $\mathbf{r}' \in \mathbb{R}^{n+1}$, and $\mathbf{x} \in \mathbb{R}^{n-1}$ and $\mathbf{x'} \in \mathbb{R}^n$.
-Specificaly, suppose that $n = 3$ and that $i = 2$ (so we are splitting the middle of three segments).
-Then, the differentiable bijection from $(\mathbf{r}, \mathbf{x}, u_1, u_2) \in \mathbb{R}^{7}$ to $(\mathbf{r}', \mathbf{x}') \in \mathbb{R}^{7}$ is:
-\[
-\begin{array}{rl}
-x_1' &= x_1 \\
-x_2' &= u_1 \\
-x_3' &= x_2 \\
-r_1' &= r_1 \\
-r_2' &= f_1(r_2, x_1, u_1, x_2, u_2) \\
-r_3' &= f_2(r_2, x_1, u_1, x_2, u_2)\\
-r_4' &= r_3
-\end{array}
-\]
-for some functions $f_1$ and $f_2$ whose details will not concern us here.
-Then, the Jacobian (where columns are inputs and rows are outputs) is as follows, where rows and columns are labeled:
-\[
-\begin{array}{c|ccccccc}
-& x_1 & x_2 & r_1 & r_2 & r_3 & u_1 & u_2 \\
-\hline
-x_1' & 1 & 0 & 0 & 0 & 0 & 0 & 0 \\
-x_2' & 0 & 0 & 0 & 0 & 0 & 1 & 0 \\
-x_3' & 0 & 1 & 0 & 0 & 0 & 0 & 0 \\
-r_1' & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
-r_2' & {\scriptstyle \partial f_1 / \partial x_1}
- & {\scriptstyle \partial f_1 / \partial x_2}
- & 0
- & {\scriptstyle \partial f_1 / \partial r_2}
- & 0
- & {\scriptstyle \partial f_1 / \partial u_1}
- & {\scriptstyle \partial f_1 / \partial u_2} \\
-r_3' & {\scriptstyle \partial f_2 / \partial x_1}
- & {\scriptstyle \partial f_2 / \partial x_2}
- & 0
- & {\scriptstyle \partial f_2 / \partial r_2}
- & 0
- & {\scriptstyle \partial f_2 / \partial u_1}
- & {\scriptstyle \partial f_2 / \partial u_2} \\
-r_4' & 0 & 0 & 0 & 0 & 1 & 0 & 0
-\end{array}
-\]
-Note that the following rows have a single 1 and all other entries 0: $x_1', x_2', x_3', r_1', r_4'$.
-Therefore, by cofactor expansion, the absolute value of the determinant of this matrix is equivalent to the absolute value of the determinant of the following submatrix of the Jacobian:
-\[
-\begin{array}{c|ccccccc}
-& r_2 & u_2 \\
-\hline
-r_2' & {\scriptstyle \partial f_1 / \partial r_2} & {\scriptstyle \partial f_1 / \partial u_2} \\
-r_3' & {\scriptstyle \partial f_2 / \partial r_2} & {\scriptstyle \partial f_2 / \partial u_2} \\
-\end{array}
-\]
-To compute the Jacobian determinant, it turns out that we only need to compute $4$ of the $49$ elements of the Jacobian.
-Also, somewhat surprisingly, we do not need to compute all elements that are not $0$ or $1$.
-
-\section{Automatically Exploiting the Sparse Jacobian Structure}
-
-This section describes how to automatically perform the optimization recognized above in a general setting for arbitrary generative models and RJMCMC kernels, while using automatic differentiation.
-Let $t \in \mathbb{R}^K$ denote the vector of values of continuous random choices in the previous model trace.
-Let $u \in \mathbb{R}^L$ denote the vector of values of continuous random choices in the proposal trace.
-Similarly, let $t' \in \mathbb{R}^{K'}$ and $u' \in \mathbb{R}^{L'}$ denote the vector of values of continuous random choices in the new model trace, and new proposal trace.
-We assume that $K + L = K' + L'$ (this is the \emph{dimension matching} requirement), and that we are given a bijection $h : \mathbb{R}^{K + L} \to \mathbb{R}^{K + L}$ that is differentiable and whose Jacobian has nonzero determinant everwhere.\footnote{It may be possible to generalize and allow functions that are not differentiable at a measure zero set of points.}
-This section discusses how to efficiently compute the absolute value of the determinant of the Jacobian of $h$, which we denote by $J \in \mathbb{R}^{(K + L) \times (K + L)}$.
-
-$J$ is often sparse due to the fact that often a large fraction of the entries in $(t', u')$ are obtained by \emph{copying} the value of some entry in $(t, u)$.
-We denote the subvector of entries of $t'$ that are produced by copying as $t'_c$ and similarly for $u'$ (subvector of copies is denoted $u'_c$).
-We denote the subvectors containing other entries of $t'$ and $u'$ by $t'_w$ and $u'_w$ respectively (where $w$ stands for `write').
-Let $t_c$ and $u_c$ denote the subvectors of $t$ and $u$ respectively such that each entry is copied to at least one entry in $(t', u')$.
-We denote the subvectors containing the remaining entries of $t$ and $u$ as $t_r$ and $u_r$ (where $r$ stands for `read').
-The Jacobian of $h$ (up to reordering of rows and columns, which does not change the absolute value of the determinant) then has the following block structure, where white space indicates a zero submatrix (and where sets of rows and columns are labeled on right):
-\[
-J = \left[
-\begin{array}{cccc}
-J_{11} & J_{12} & & \\
-J_{21} & J_{22} & & \\
-J_{31} & J_{32} & J_{33} & J_{34}\\
-J_{41} & J_{42} & J_{43} & J_{44}
-\end{array}
-\right]
-\;\;\;\;\;
-\begin{array}{c|cccc}
-& t_c & u_c & t_r & u_r\\
-\hline
-t'_c & J_{11} & J_{12} & & \\
-u'_c & J_{21} & J_{22} & & \\
-t'_w & J_{31} & J_{32} & J_{33} & J_{34}\\
-u'_w & J_{41} & J_{42} & J_{43} & J_{44}
-\end{array}
-\]
-
-We will show that the absolute value of the determinant of this matrix is the same as the absolute value of the determinant of the following submatrix (which we will show is square), which much smaller than $J$ in the common case when a reversible jump move only modifies a subset of the random choices in a model:
-\[
-\left[
-\begin{array}{cc}
-J_{33} & J_{34}\\
-J_{43} & J_{44}
-\end{array}
-\right]
-\]
-
-\begin{prop}
-The submatrix $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ is square.
-\end{prop}
-\begin{proof}
-Each row of $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ contains exactly one $1$ and all other entries are $0$, by the definition of $t_c'$ and $u_c'$.
-Each column of $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ contains at least one $1$, by the definition of $t_c$ and $u_c$.
-Therefore, the number of rows must be greater than or equal to the number of columns.
-Since we assume that $J$ has nonzero determinant, all its rows must be linearly independent.
-Together with the block structure of $J$ shown above, this implies that rows of $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ are linearly independent, and in particular that no two rows are identical.
-Since each row of $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ contains exactly one $1$, the number of columns in $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ must be greater than or equal to the number of rows.
-\end{proof}
-
-\begin{prop}
-The submatrix $[J_{33} \; J_{34}; J_{43} \; J_{44}]$ is square.
-\end{prop}
-\begin{proof}
-$J$ is square and $[J_{11} \; J_{12}; J_{21} \; J_{22}]$ is square.
-\end{proof}
-
-\begin{prop}
-$|\det(J)| = |\det([J_{33} \; J_{34}; J_{43} \; J_{44}])|$.
-\end{prop}
-\begin{proof}
-Each row of $J$ corresponding to an entry of $t_c'$ or an entry of $u_c'$ has a single $1$ entry and other entries $0$ by definition of $t_c'$ and $u_c'$.
-Then, the result follows by consecutive co-factor expansion of the determinant of $J$, deleting all rows in $t_c'$ and all columns in $u_c'$.
-\end{proof}
-
-Automating the computation of $|\det(J)|$ by computing instead the determinant of the submatrix allows us to avoid evaluating the entire $J$, which scales quadratically in the maximum number of random choices in the model, and allows us to avoid computing the full determinant, which scales cubically (albeit likely smaller smaller lower-order factors than computing the Jacobian itself).
-We only need to evaluate the submatrix $[J_{33} \; J_{34}; J_{43} \; J_{44}]$.
-However, this requires knowing which entries in $t$ and $u$ correspond to $(t_r, u_r)$, and the which entries in $t'$ and $u'$ correspond to $(t'_w, u'_w)$.
-This is made possible using a domain-specific language (DSL) for defining the bijection $h$ with separate language constructs for:
-\begin{enumerate}
-\item Reading the values of continuous random choices from $t$ and $u$.
-\item Writing the values of continuous random choices to $t'$ and $u'$.
-\item Copying the values of continuous random choices from $t$ and $u$ to $t'$ and $u'$.
-If the user instead express a copy using a read operation followed by a write operation, but this will result in unecessary extra computation.
-\item Tagging read operations for of continuous random choices read from $t$ as \emph{retained}, which asserts that the random choice will not be deleted by the reversible jump move.
-If the same address is also not written to in $t'$, and the user assertion is true, then the random choice will be copied implicitly from $t$, which means it can be treated as part of $t_c$, reducing the size of the Jacobian submatrix that needs to be computed.
-This is optional -- not tagging a read operation as retained when it is results in unecessary computation.
-\end{enumerate}
-Automatic differentiation is used to compute the entries of $[J_{33} \; J_{34}; J_{43} \; J_{44}]$.
-Finally, note that specification of the involution on discrete random choices may or may not be separated from the specification of the family of bijections $h$ on the continuous random choices.
-In the current Gen implementation, a single unified DSL is used to specify the entire involution and the approach described in this document is used for Jacobian determinant correction computation.
-
-\bibliographystyle{unsrt}
-\bibliography{references}
-
-
-\end{document}
diff --git a/docs/tex/references.bib b/docs/tex/references.bib
deleted file mode 100644
index f7bd2a17d..000000000
--- a/docs/tex/references.bib
+++ /dev/null
@@ -1,39 +0,0 @@
-@article{green1995reversible,
- title={Reversible jump {{M}}arkov chain {{M}}onte {{C}}arlo computation and {{B}}ayesian model determination},
- author={Green, Peter J},
- journal={Biometrika},
- volume={82},
- number={4},
- pages={711--732},
- year={1995},
- publisher={Oxford University Press}
-}
-
-@inproceedings{goodman2008church,
-title = {Church: a language for generative models},
-author = {Goodman, Noah and Mansinghka, Vikash and Roy, Daniel M. and Bonawitz, Keith and Tenenbaum, Joshua B.},
-booktitle = {Proceedings of the 24th Annual Conference on Uncertainty in Artificial Intelligence},
-series = {UAI 2008},
-pages = {220--229},
-year = {2008},
-publisher = {AUAI Press},
-}
-
-@inproceedings{wingate2011lightweight,
- title={Lightweight Implementations of Probabilistic Programming Languages Via Transformational Compilation.},
- author={Wingate, David and Stuhlm{\"u}ller, Andreas and Goodman, Noah D},
- booktitle={AISTATS},
- pages={770--778},
- year={2011}
-}
-
-@inproceedings{cusumano2019gen,
- title={Gen: a general-purpose probabilistic programming system with programmable inference},
- author={Cusumano-Towner, Marco F and Saad, Feras A and Lew, Alexander K and Mansinghka, Vikash K},
- booktitle={Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation},
- pages={221--236},
- year={2019},
- organization={ACM}
-}
-
-
diff --git a/src/address.jl b/src/address.jl
index fe0fb30e9..8236a7e9c 100644
--- a/src/address.jl
+++ b/src/address.jl
@@ -29,7 +29,7 @@ export AllAddressSchema
######################
"""
- abstract type Selection end
+ Selection
Abstract type for selections of addresses.
@@ -112,6 +112,11 @@ export AllSelection
# complement selection #
########################
+"""
+ ComplementSelection <: Selection
+
+A hierarchical selection that is the complement of the given selection. An address is in the selection if it is not in the complement selection.
+"""
struct ComplementSelection <: Selection
complement::Selection
end
diff --git a/src/choice_map.jl b/src/choice_map.jl
index fc72524cd..3efe6a48d 100644
--- a/src/choice_map.jl
+++ b/src/choice_map.jl
@@ -362,6 +362,11 @@ export to_array, from_array
# static assignment #
######################
+"""
+ StaticChoiceMap <: ChoiceMap
+
+An immutable map from statically-traced addresses to values.
+"""
struct StaticChoiceMap{R,S,T,U} <: ChoiceMap
leaf_nodes::NamedTuple{R,S}
internal_nodes::NamedTuple{T,U}
@@ -621,6 +626,12 @@ export pair, unpair
# dynamic assignment #
#######################
+"""
+ DynamicChoiceMap <: ChoiceMap
+
+A mutable map from arbitrary hierarchical addresses to values.
+
+"""
struct DynamicChoiceMap <: ChoiceMap
leaf_nodes::Dict{Any,Any}
internal_nodes::Dict{Any,Any}
@@ -632,22 +643,19 @@ end
# invariant: all internal nodes are nonempty
"""
- struct DynamicChoiceMap <: ChoiceMap .. end
-
-A mutable map from arbitrary hierarchical addresses to values.
-
choices = DynamicChoiceMap()
Construct an empty map.
-
- choices = DynamicChoiceMap(tuples...)
-
-Construct a map containing each of the given (addr, value) tuples.
"""
function DynamicChoiceMap()
DynamicChoiceMap(Dict(), Dict())
end
+"""
+ choices = DynamicChoiceMap(tuples...)
+
+Construct a map containing each of the given (addr, value) tuples.
+"""
function DynamicChoiceMap(tuples...)
choices = DynamicChoiceMap()
for tuple in tuples
@@ -906,6 +914,11 @@ export vectorize_internal
# empty assignment #
####################
+"""
+ EmptyChoiceMap <: ChoiceMap
+
+An empty choice map.
+"""
struct EmptyChoiceMap <: ChoiceMap end
Base.isempty(::EmptyChoiceMap) = true
diff --git a/src/diff.jl b/src/diff.jl
index 6af2cdd24..ec0c41eb7 100644
--- a/src/diff.jl
+++ b/src/diff.jl
@@ -25,26 +25,29 @@
# a value with type Diffed should never appear outside of the update context.
"""
- abstract type Diff end
+ Diff
-Abstract type for information about a change to a value.
+Abstract supertype for information about a change to a value.
"""
abstract type Diff end
"""
UnknownChange
-No information is provided about the change to the value.
+Singleton to indicate that the change to the value is unknown or not provided.
"""
struct UnknownChange <: Diff end
"""
NoChange
-The value did not change.
+Singleton to indicate the value did not change.
"""
struct NoChange <: Diff end
+"""
+ SetDiff <: Diff
+"""
struct SetDiff{V} <: Diff
# elements that were added
diff --git a/src/inference/trace_translators.jl b/src/inference/trace_translators.jl
index 67591627a..7e5bfde54 100644
--- a/src/inference/trace_translators.jl
+++ b/src/inference/trace_translators.jl
@@ -156,6 +156,11 @@ macro transform(f_expr, src_expr, to_symbol::Symbol, dest_expr, body)
end
end
+"""
+ @tcall(ex)
+
+A macro to call a transformation function from inside another transformation function.
+"""
macro tcall(ex)
MacroTools.@capture(ex, f_(args__)) || error("expected syntax: f(..)")
return quote $(esc(f)).fn!($(esc(bij_state)), $(map(esc, args)...)) end
@@ -733,7 +738,7 @@ Run the translator with:
(output_trace, log_weight) = translator(input_trace; check=false, prev_observations=EmptyChoiceMap())
Use `check` to enable a bijection check (this requires that the transform `f` has been
-paired with its inverse using [`pair_bijections!](@ref) or [`is_involution`](@ref)).
+paired with its inverse using [`pair_bijections!`](@ref) or [`is_involution!`](@ref)).
If `check` is enabled, then `prev_observations` is a choice map containing the observed
random choices in the previous trace.
@@ -872,7 +877,7 @@ Run the translator with:
(output_trace, log_weight) = translator(input_trace; check=false, observations=EmptyChoiceMap())
Use `check` to enable the involution check (this requires that the transform `f` has been
-marked with [`is_involution`](@ref)).
+marked with [`is_involution!`](@ref)).
If `check` is enabled, then `observations` is a choice map containing the observed random
choices, and the check will additionally ensure they are not mutated by the involution.
diff --git a/src/modeling_library/recurse/recurse.jl b/src/modeling_library/recurse/recurse.jl
index 7b4d1d23d..88bd5dd7d 100644
--- a/src/modeling_library/recurse/recurse.jl
+++ b/src/modeling_library/recurse/recurse.jl
@@ -110,7 +110,7 @@ end
# production_kern::GenerativeFunction{Tuple{V,Vector{U}},S}
# aggregation_kern::GenerativeFunction{W,T}
-""""
+"""
Recurse(production_kernel, aggregation_kernel, max_branch,
::Type{U}, ::Type{V}, ::Type{W})
diff --git a/src/modeling_library/switch/update.jl b/src/modeling_library/switch/update.jl
index d14af1278..49cf48cbb 100644
--- a/src/modeling_library/switch/update.jl
+++ b/src/modeling_library/switch/update.jl
@@ -10,6 +10,11 @@ mutable struct SwitchUpdateState{T}
SwitchUpdateState{T}(weight::Float64, score::Float64, noise::Float64, prev_trace::Trace) where T = new{T}(weight, score, noise, prev_trace)
end
+"""
+ update_recurse_merge(prev_choices::ChoiceMap, choices::ChoiceMap)
+
+Returns choices that are in constraints, merged with all choices in the previous trace that do not have the same address as some choice in the constraints.
+"""
function update_recurse_merge(prev_choices::ChoiceMap, choices::ChoiceMap)
prev_choice_submap_iterator = get_submaps_shallow(prev_choices)
prev_choice_value_iterator = get_values_shallow(prev_choices)
@@ -48,13 +53,14 @@ function update_recurse_merge(prev_choices::ChoiceMap, choices::ChoiceMap)
return new_choices
end
-@doc(
-"""
-update_recurse_merge(prev_choices::ChoiceMap, choices::ChoiceMap)
-Returns choices that are in constraints, merged with all choices in the previous trace that do not have the same address as some choice in the constraints."
-""", update_recurse_merge)
+"""
+ update_discard(prev_choices::ChoiceMap, choices::ChoiceMap, new_choices::ChoiceMap)
+Returns choices from previous trace that:
+ 1. have an address which does not appear in the new trace.
+ 2. have an address which does appear in the constraints.
+"""
function update_discard(prev_choices::ChoiceMap, choices::ChoiceMap, new_choices::ChoiceMap)
discard = choicemap()
for (k, v) in get_submaps_shallow(prev_choices)
@@ -71,14 +77,6 @@ function update_discard(prev_choices::ChoiceMap, choices::ChoiceMap, new_choices
discard
end
-@doc(
-"""
-update_discard(prev_choices::ChoiceMap, choices::ChoiceMap, new_choices::ChoiceMap)
-
-Returns choices from previous trace that:
- 1. have an address which does not appear in the new trace.
- 2. have an address which does appear in the constraints.
-""", update_discard)
@inline update_discard(prev_trace::Trace, choices::ChoiceMap, new_trace::Trace) = update_discard(get_choices(prev_trace), choices, get_choices(new_trace))
diff --git a/src/modeling_library/vector.jl b/src/modeling_library/vector.jl
index f35360b50..7dea5ae28 100644
--- a/src/modeling_library/vector.jl
+++ b/src/modeling_library/vector.jl
@@ -5,6 +5,7 @@ using FunctionalCollections: PersistentVector, assoc, push, pop
###########################################
"""
+ VectorTrace <: Trace
U is the type of the subtrace, R is the return value type for the kernel
"""
diff --git a/test/dsl/dynamic_dsl.jl b/test/dsl/dynamic_dsl.jl
index 25e3a50f4..2385a4b44 100644
--- a/test/dsl/dynamic_dsl.jl
+++ b/test/dsl/dynamic_dsl.jl
@@ -526,7 +526,7 @@ end
"""
@gen function foo(x)
return x + 1
- end
+ end
io = IOBuffer()
print(io, @doc foo)