diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2e76bff..1460ef2 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -54,7 +54,6 @@
     "sphinx.ext.autosummary",
     "sphinx.ext.napoleon",
     "sphinx.ext.intersphinx",
-    "sphinx_mdinclude",
     "sphinx_immaterial",
     "sphinx_immaterial.apidoc.python.apigen",
     "sphinx.ext.linkcode",
diff --git a/graphicle/calculate.py b/graphicle/calculate.py
index 9d62e70..4ba1073 100644
--- a/graphicle/calculate.py
+++ b/graphicle/calculate.py
@@ -415,7 +415,7 @@ def flow_trace(
     return traces
 
 
-@nb.njit("float64[:](float64[:], float64[:], float64)")
+@nb.njit("float64[:](float64[:], float64[:], float64)", cache=True)
 def _rapidity(
     energy: base.DoubleVector, z: base.DoubleVector, zero_tol: float
 ) -> base.DoubleVector:
@@ -447,7 +447,7 @@ def _rapidity(
     return rap
 
 
-@nb.vectorize("float64(float64, float64)")
+@nb.vectorize("float64(float64, float64)", cache=True)
 def _root_diff_two_squares(
     x1: base.DoubleUfunc, x2: base.DoubleUfunc
 ) -> base.DoubleUfunc:
@@ -484,6 +484,7 @@ def _root_diff_two_squares(
 @nb.njit(
     "float64[:, :](float64[:], float64[:], complex128[:], complex128[:])",
     parallel=True,
+    cache=True,
 )
 def _delta_R(
     rapidity_1: base.DoubleVector,
@@ -522,7 +523,7 @@ def _delta_R(
     return result
 
 
-@nb.njit("float64[:, :](float64[:], complex128[:])", parallel=True)
+@nb.njit("float64[:, :](float64[:], complex128[:])", parallel=True, cache=True)
 def _delta_R_symmetric(
     rapidity: base.DoubleVector, xy_pol: base.ComplexVector
 ) -> base.DoubleVector:
@@ -559,7 +560,7 @@ def _delta_R_symmetric(
     return result
 
 
-@nb.njit("float32[:](bool_[:, :])", parallel=True)
+@nb.njit("float32[:](bool_[:, :])", parallel=True, cache=True)
 def _clust_coeffs(adj: base.BoolVector) -> base.FloatVector:
     num_nodes = adj.shape[0]
     coefs = np.empty(num_nodes, dtype=np.float32)
@@ -652,3 +653,32 @@ def aggregate_momenta(
     pmus = map(fn.partial(op.getitem, pmu), cluster_masks)
     pmu_sums = map(fn.partial(np.sum, axis=0), pmus)
     return momentum_class(list(it.chain.from_iterable(pmu_sums)))
+
+
+@nb.njit(
+    "float64[:, :](float64[:], float64[:], complex128[:], complex128[:])",
+    parallel=True,
+    cache=True,
+)
+def _assignment_cost(
+    rapidity_1: base.DoubleVector,
+    rapidity_2: base.DoubleVector,
+    xy_pol_1: base.ComplexVector,
+    xy_pol_2: base.ComplexVector,
+) -> base.DoubleVector:
+    dist_matrix = _delta_R(rapidity_1, rapidity_2, xy_pol_1, xy_pol_2)
+    pt_2 = rapidity_2  # recycle memory buffer for transverse momenta
+    for pol_idx, pol_val in enumerate(xy_pol_2):
+        pt_2[pol_idx] = abs(pol_val)
+    var_pt_recip = 1.0 / np.var(pt_2)
+    num_partons = dist_matrix.shape[0]
+    for parton_idx in nb.prange(num_partons):
+        row = dist_matrix[parton_idx, :]
+        pt_1_val = abs(xy_pol_1[parton_idx])
+        var_dR_recip = 1.0 / np.var(row)
+        for jet_idx, (dR_val, pt_2_val) in enumerate(zip(row, pt_2)):
+            dpt = pt_1_val - pt_2_val
+            dR_cost = math.expm1(-0.5 * var_dR_recip * dR_val * dR_val)
+            pt_cost = math.expm1(-0.5 * var_pt_recip * dpt * dpt)
+            row[jet_idx] = -(dR_cost + pt_cost)
+    return dist_matrix
diff --git a/graphicle/select.py b/graphicle/select.py
index 3661c64..4aeb71c 100644
--- a/graphicle/select.py
+++ b/graphicle/select.py
@@ -102,13 +102,36 @@ def fastjet_clusters(
         List containing masks over the input data for each jet
         clustering, in order of descending :math:`p_T`.
 
+    Raises
+    ------
+    ValueError
+        When a negative value is passed to ``eta_cut``, ``pt_cut``, or
+        ``radius``. Additionally, when ``top_k`` is passed as either a
+        non-integer, or with a value less than one.
+
     Notes
     -----
     ``p_val`` set to ``-1`` gives **anti-kT**, ``0`` gives
     **Cambridge-Aachen**, and ``1`` gives **kT** clusterings.
+
+    To prevent expensive repeated memory allocations, the underlying
+    masks are stored as a single contiguous array, where each row is the
+    data for the respective ``MaskArray`` in the output list. This may
+    cause undefined behaviour if you apply views on the underlying data
+    in a ``MaskArray`` without copying it.
     """
+    if pt_cut is None:
+        pt_cut = 0.0
+    elif pt_cut < 0.0:
+        raise ValueError("pt_cut must be non-negative.")
+    if radius < 0.0:
+        raise ValueError("radius must be non-negative.")
+    if (top_k is not None) and ((top_k < 1) or (not isinstance(top_k, int))):
+        raise ValueError(
+            "top_k must be an integer with a value greater than zero."
+        )
     num_pcls = len(pmu)
-    pmu_renamed = pmu.data.copy()[["e", "x", "y", "z"]]
+    pmu_renamed = pmu.data[["e", "x", "y", "z"]].copy()
     pmu_renamed.dtype.names = "E", "px", "py", "pz"
     pmu_ak = ak.from_numpy(pmu_renamed)
     extra_param = tuple()
@@ -121,30 +144,24 @@ def fastjet_clusters(
     else:
         algo = fj.genkt_algorithm
         extra_param = (p_val,)
-    jetdef = fj.JetDefinition(algo, radius, *extra_param)
-    sequence = fj.ClusterSequence(pmu_ak, jetdef)
-    jets = sequence.inclusive_jets()
+    jet_def = fj.JetDefinition(algo, radius, *extra_param)
+    sequence = fj.ClusterSequence(pmu_ak, jet_def)
+    jets = sequence.inclusive_jets(pt_cut)
     jet_pmus = gcl.MomentumArray(jets.to_numpy())
-    num_jets = len(jet_pmus)
     pt_descend_idxs = np.argsort(jet_pmus.pt)[::-1].tolist()
-    jet_pmus = jet_pmus[pt_descend_idxs]
-    cuts = np.ones(num_jets, dtype=np.bool_)
-    if pt_cut is not None:
-        cuts = jet_pmus.pt > pt_cut
+    jet_idxs_ = sequence.constituent_index(pt_cut)[pt_descend_idxs]
     if eta_cut is not None:
-        cuts = np.logical_and(cuts, np.abs(jet_pmus.eta) < eta_cut)
-    jet_idxs = sequence.constituent_index().to_list()
-    jet_idxs = op.itemgetter(*pt_descend_idxs)(jet_idxs)
-    jet_idxs = it.compress(jet_idxs, cuts)
-    if top_k is not None:
-        jet_idxs = it.islice(jet_idxs, top_k)
-    mask_empty = gcl.MaskArray(np.zeros(num_pcls, dtype="<?"))
-    clusters: ty.List[gcl.MaskArray] = []
-    for jet_idx in jet_idxs:
-        mask = mask_empty.copy()
+        if eta_cut < 0.0:
+            raise ValueError("eta_cut must be non-negative.")
+        jet_pmus = jet_pmus[pt_descend_idxs]
+        jet_idxs_ = jet_idxs_[np.abs(jet_pmus.eta) < eta_cut]
+    jet_idxs_ = jet_idxs_[:top_k]
+    jet_idxs = jet_idxs_.to_list()
+    num_jets = len(jet_idxs)
+    cluster_buffer = np.zeros((num_jets, num_pcls), dtype=np.bool_)
+    for mask, jet_idx in zip(cluster_buffer, jet_idxs):
         mask[jet_idx] = True
-        clusters.append(mask)
-    return clusters
+    return list(map(gcl.MaskArray, cluster_buffer))
 
 
 def find_vertex(
@@ -1145,7 +1162,9 @@ def clusters(
 
 
 def arg_closest(
-    focus: gcl.MomentumArray, candidate: gcl.MomentumArray
+    focus: gcl.MomentumArray,
+    candidate: gcl.MomentumArray,
+    num_threads: int = 1,
 ) -> ty.List[int]:
     """Assigns four-momenta elements in ``candidate`` to the nearest
     four-momenta elements in ``focus``. Elements in ``candidate`` are
@@ -1155,6 +1174,9 @@ def arg_closest(
 
     .. versionadded:: 0.2.14
 
+    .. versionchanged:: 0.3.8
+       Modified the distance metric to include transverse momentum.
+
     Parameters
     ----------
     focus : MomentumArray
@@ -1163,6 +1185,9 @@ def arg_closest(
     candidate : MomentumArray
         Four-momenta of candidate objects to draw from until ``focus``
         objects have each received an assignment.
+    num_threads : int
+        Number of threads to parallelise the cost matrix computation
+        over. Default is 1.
 
     Returns
     -------
@@ -1196,7 +1221,15 @@ def arg_closest(
        Systems*, 52(4):1679-1696, August 2016,
        :doi:`10.1109/TAES.2016.140952`
     """
-    _, idxs = opt.linear_sum_assignment(focus.delta_R(candidate, pseudo=False))
+    with gcl.calculate._thread_scope(num_threads):
+        cost_matrix = gcl.calculate._assignment_cost(
+            focus.rapidity,
+            candidate.rapidity,
+            focus._xy_pol,
+            candidate._xy_pol,
+        )
+    del candidate.rapidity  # inplace operation above invalidates cache
+    _, idxs = opt.linear_sum_assignment(cost_matrix)
     return idxs.tolist()
 
 
@@ -1343,6 +1376,11 @@ def monte_carlo_tag(
     if clustered_pmu is None:
         clustered_pmu = particles.pmu[particles.final]
         ref_length = "particles.final"
+    if len(cluster_masks) < len(hard_pmu):
+        raise ValueError(
+            f"shape mismatch: only {len(cluster_masks)} clusters "
+            f"passed to tag {len(hard_pmu)} partons."
+        )
     if len(clustered_pmu) != len(cluster_masks[0]):
         raise ValueError(
             "shape mismatch: length of elements in cluster_masks must be the "