+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from itertools import chain
+import logging
+import math
+from pathlib import Path
+import random
+import re
+import typing as tp
+import warnings
+
+import einops
+from num2words import num2words
+import spacy
+from transformers import RobertaTokenizer, T5EncoderModel, T5Tokenizer # type: ignore
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+
+from .chroma import ChromaExtractor
+from .streaming import StreamingModule
+from .transformer import create_sin_embedding
+from ..data.audio import audio_read
+from ..data.audio_dataset import SegmentInfo
+from ..data.audio_utils import convert_audio
+from ..environment import AudioCraftEnvironment
+from ..quantization import ResidualVectorQuantizer
+from ..utils.autocast import TorchAutocast
+from ..utils.cache import EmbeddingCache
+from ..utils.utils import collate, hash_trick, length_to_mask, load_clap_state_dict, warn_once
+
+
+logger = logging.getLogger(__name__)
+TextCondition = tp.Optional[str]  # a text condition can be a string or None (if it doesn't exist)
+ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
+
+
+class WavCondition(tp.NamedTuple):
+ wav: torch.Tensor
+ length: torch.Tensor
+ sample_rate: tp.List[int]
+ path: tp.List[tp.Optional[str]] = []
+ seek_time: tp.List[tp.Optional[float]] = []
+
+
+class JointEmbedCondition(tp.NamedTuple):
+ wav: torch.Tensor
+ text: tp.List[tp.Optional[str]]
+ length: torch.Tensor
+ sample_rate: tp.List[int]
+ path: tp.List[tp.Optional[str]] = []
+ seek_time: tp.List[tp.Optional[float]] = []
+
+
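+# Hypothetical usage sketch (not part of the original module; values are arbitrary
+# placeholders): a WavCondition packs a batched waveform together with its per-item
+# length and metadata.
+def _example_wav_condition() -> WavCondition:
+    wav = torch.zeros(1, 1, 32000)  # [B, C, T], e.g. 1 second of silence at 32 kHz
+    return WavCondition(wav, length=torch.tensor([32000]),
+                        sample_rate=[32000], path=[None], seek_time=[0.0])
+
+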
+@dataclass
+class ConditioningAttributes:
+ text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
+ wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
+ joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
+
+ def __getitem__(self, item):
+ return getattr(self, item)
+
+ @property
+ def text_attributes(self):
+ return self.text.keys()
+
+ @property
+ def wav_attributes(self):
+ return self.wav.keys()
+
+ @property
+ def joint_embed_attributes(self):
+ return self.joint_embed.keys()
+
+ @property
+ def attributes(self):
+ return {
+ "text": self.text_attributes,
+ "wav": self.wav_attributes,
+ "joint_embed": self.joint_embed_attributes,
+ }
+
+ def to_flat_dict(self):
+ return {
+ **{f"text.{k}": v for k, v in self.text.items()},
+ **{f"wav.{k}": v for k, v in self.wav.items()},
+ **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()}
+ }
+
+ @classmethod
+ def from_flat_dict(cls, x):
+ out = cls()
+ for k, v in x.items():
+ kind, att = k.split(".")
+ out[kind][att] = v
+ return out
+
+
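+# Illustrative sketch (not part of the original module; attribute names are arbitrary
+# examples): ConditioningAttributes round-trips through its flat-dict representation.
+def _example_conditioning_attributes() -> ConditioningAttributes:
+    attrs = ConditioningAttributes(text={"genre": "Rock", "description": "A rock song"})
+    flat = attrs.to_flat_dict()  # {"text.genre": "Rock", "text.description": "A rock song"}
+    restored = ConditioningAttributes.from_flat_dict(flat)
+    assert restored.text == attrs.text
+    return restored
+
+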
+class SegmentWithAttributes(SegmentInfo):
+ """Base class for all dataclasses that are used for conditioning.
+ All child classes should implement `to_condition_attributes` that converts
+ the existing attributes to a dataclass of type ConditioningAttributes.
+ """
+ def to_condition_attributes(self) -> ConditioningAttributes:
+ raise NotImplementedError()
+
+
+def nullify_condition(condition: ConditionType, dim: int = 1):
+ """Transform an input condition to a null condition.
+    This is done by converting the condition to a single zero vector, similarly
+ to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.
+
+ Args:
+ condition (ConditionType): A tuple of condition and mask (tuple[torch.Tensor, torch.Tensor])
+ dim (int): The dimension that will be truncated (should be the time dimension)
+ WARNING!: dim should not be the batch dimension!
+ Returns:
+ ConditionType: A tuple of null condition and mask
+ """
+ assert dim != 0, "dim cannot be the batch dimension!"
+ assert isinstance(condition, tuple) and \
+ isinstance(condition[0], torch.Tensor) and \
+ isinstance(condition[1], torch.Tensor), "'nullify_condition' got an unexpected input type!"
+ cond, mask = condition
+ B = cond.shape[0]
+ last_dim = cond.dim() - 1
+ out = cond.transpose(dim, last_dim)
+ out = 0. * out[..., :1]
+ out = out.transpose(dim, last_dim)
+ mask = torch.zeros((B, 1), device=out.device).int()
+ assert cond.dim() == out.dim()
+ return out, mask
+
+
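+# Minimal sketch (assumption, not part of the original module; shapes are arbitrary):
+# nullifying a dummy [B, T, D] condition collapses the time axis to a single zeroed
+# step and returns an all-zero mask.
+def _example_nullify_condition() -> ConditionType:
+    cond = torch.randn(2, 8, 16)  # [B, T, D]
+    mask = torch.ones(2, 8, dtype=torch.int)
+    null_cond, null_mask = nullify_condition((cond, mask), dim=1)
+    assert null_cond.shape == (2, 1, 16)
+    assert int(null_mask.sum()) == 0
+    return null_cond, null_mask
+
+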
+def nullify_wav(cond: WavCondition) -> WavCondition:
+ """Transform a WavCondition to a nullified WavCondition.
+ It replaces the wav by a null tensor, forces its length to 0, and replaces metadata by dummy attributes.
+
+ Args:
+        cond (WavCondition): Wav condition with a wav tensor of shape [B, T].
+ Returns:
+ WavCondition: Nullified wav condition.
+ """
+ null_wav, _ = nullify_condition((cond.wav, torch.zeros_like(cond.wav)), dim=cond.wav.dim() - 1)
+ return WavCondition(
+ wav=null_wav,
+ length=torch.tensor([0] * cond.wav.shape[0], device=cond.wav.device),
+ sample_rate=cond.sample_rate,
+ path=[None] * cond.wav.shape[0],
+ seek_time=[None] * cond.wav.shape[0],
+ )
+
+
+def nullify_joint_embed(embed: JointEmbedCondition) -> JointEmbedCondition:
+ """Nullify the joint embedding condition by replacing it by a null tensor, forcing its length to 0,
+ and replacing metadata by dummy attributes.
+
+ Args:
+ cond (JointEmbedCondition): Joint embedding condition with wav and text, wav tensor of shape [B, C, T].
+ """
+ null_wav, _ = nullify_condition((embed.wav, torch.zeros_like(embed.wav)), dim=embed.wav.dim() - 1)
+ return JointEmbedCondition(
+ wav=null_wav, text=[None] * len(embed.text),
+ length=torch.LongTensor([0]).to(embed.wav.device),
+ sample_rate=embed.sample_rate,
+ path=[None] * embed.wav.shape[0],
+ seek_time=[0] * embed.wav.shape[0],
+ )
+
+
+class Tokenizer:
+ """Base tokenizer implementation
+    (in case we want to introduce more advanced tokenizers in the future).
+ """
+ def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ raise NotImplementedError()
+
+
+class WhiteSpaceTokenizer(Tokenizer):
+ """This tokenizer should be used for natural language descriptions.
+ For example:
+ ["he didn't, know he's going home.", 'shorter sentence'] =>
+ [[78, 62, 31, 4, 78, 25, 19, 34],
+ [59, 77, 0, 0, 0, 0, 0, 0]]
+ """
+ PUNCTUATION = "?:!.,;"
+
+ def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
+ lemma: bool = True, stopwords: bool = True) -> None:
+ self.n_bins = n_bins
+ self.pad_idx = pad_idx
+ self.lemma = lemma
+ self.stopwords = stopwords
+ try:
+ self.nlp = spacy.load(language)
+ except IOError:
+ spacy.cli.download(language) # type: ignore
+ self.nlp = spacy.load(language)
+
+ @tp.no_type_check
+ def __call__(self, texts: tp.List[tp.Optional[str]],
+ return_text: bool = False) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ """Take a list of strings and convert them to a tensor of indices.
+
+ Args:
+ texts (list[str]): List of strings.
+ return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]:
+ - Indices of words in the LUT.
+ - And a mask indicating where the padding tokens are
+ """
+ output, lengths = [], []
+ texts = deepcopy(texts)
+ for i, text in enumerate(texts):
+ # if current sample doesn't have a certain attribute, replace with pad token
+ if text is None:
+ output.append(torch.Tensor([self.pad_idx]))
+ lengths.append(0)
+ continue
+
+ # convert numbers to words
+ text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text) # type: ignore
+ # normalize text
+ text = self.nlp(text) # type: ignore
+ # remove stopwords
+ if self.stopwords:
+ text = [w for w in text if not w.is_stop] # type: ignore
+ # remove punctuation
+ text = [w for w in text if w.text not in self.PUNCTUATION] # type: ignore
+ # lemmatize if needed
+ text = [getattr(t, "lemma_" if self.lemma else "text") for t in text] # type: ignore
+
+ texts[i] = " ".join(text)
+ lengths.append(len(text))
+ # convert to tensor
+ tokens = torch.Tensor([hash_trick(w, self.n_bins) for w in text])
+ output.append(tokens)
+
+ mask = length_to_mask(torch.IntTensor(lengths)).int()
+ padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
+ if return_text:
+ return padded_output, mask, texts # type: ignore
+ return padded_output, mask
+
+
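+# Illustrative sketch (not part of the original module; assumes the spaCy
+# `en_core_web_sm` model is available or can be downloaded): WhiteSpaceTokenizer
+# hashes each word left after stopword and punctuation removal, and returns
+# per-word indices with a padding mask.
+def _example_whitespace_tokenizer() -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    tokenizer = WhiteSpaceTokenizer(n_bins=100)
+    tokens, mask = tokenizer(["A rock song with a guitar solo", None])
+    assert tokens.shape == mask.shape  # [B, T_max], second row is fully padded
+    return tokens, mask
+
+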
+class NoopTokenizer(Tokenizer):
+ """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
+ The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
+    strings, so "Jeff Buckley" will get its own index, whereas WhiteSpaceTokenizer will
+    split it into ["Jeff", "Buckley"] and return an index per word.
+
+ For example:
+ ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
+ ["Metal", "Rock", "Classical"] => [0, 223, 51]
+ """
+ def __init__(self, n_bins: int, pad_idx: int = 0):
+ self.n_bins = n_bins
+ self.pad_idx = pad_idx
+
+ def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ output, lengths = [], []
+ for text in texts:
+ # if current sample doesn't have a certain attribute, replace with pad token
+ if text is None:
+ output.append(self.pad_idx)
+ lengths.append(0)
+ else:
+ output.append(hash_trick(text, self.n_bins))
+ lengths.append(1)
+
+ tokens = torch.LongTensor(output).unsqueeze(1)
+ mask = length_to_mask(torch.IntTensor(lengths)).int()
+ return tokens, mask
+
+
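+# Hypothetical usage sketch (not part of the original module; bin count is arbitrary):
+# NoopTokenizer hashes each full string into one of `n_bins` buckets, so every entry
+# maps to exactly one token, and missing entries (None) become the pad index with a
+# zeroed mask.
+def _example_noop_tokenizer() -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    tokenizer = NoopTokenizer(n_bins=512)
+    tokens, mask = tokenizer(["Jeff Buckley", None, "Queen"])
+    assert tokens.shape == (3, 1)
+    assert mask.tolist() == [[1], [0], [1]]
+    return tokens, mask
+
+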
+class BaseConditioner(nn.Module):
+ """Base model for all conditioner modules.
+ We allow the output dim to be different than the hidden dim for two reasons:
+ 1) keep our LUTs small when the vocab is large;
+ 2) make all condition dims consistent.
+
+ Args:
+ dim (int): Hidden dim of the model.
+ output_dim (int): Output dim of the conditioner.
+ """
+ def __init__(self, dim: int, output_dim: int):
+ super().__init__()
+ self.dim = dim
+ self.output_dim = output_dim
+ self.output_proj = nn.Linear(dim, output_dim)
+
+ def tokenize(self, *args, **kwargs) -> tp.Any:
+ """Should be any part of the processing that will lead to a synchronization
+ point, e.g. BPE tokenization with transfer to the GPU.
+
+        The returned value will be saved and passed to forward() when it is later called.
+ """
+ raise NotImplementedError()
+
+ def forward(self, inputs: tp.Any) -> ConditionType:
+ """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
+ Outputs a ConditionType, after the input data was embedded as a dense vector.
+
+ Returns:
+ ConditionType:
+ - A tensor of size [B, T, D] where B is the batch size, T is the length of the
+ output embedding and D is the dimension of the embedding.
+                - And a mask indicating where the padding tokens are.
+ """
+ raise NotImplementedError()
+
+
+class TextConditioner(BaseConditioner):
+ ...
+
+
+class LUTConditioner(TextConditioner):
+ """Lookup table TextConditioner.
+
+ Args:
+ n_bins (int): Number of bins.
+ dim (int): Hidden dim of the model (text-encoder/LUT).
+ output_dim (int): Output dim of the conditioner.
+ tokenizer (str): Name of the tokenizer.
+ pad_idx (int, optional): Index for padding token. Defaults to 0.
+ """
+ def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
+ super().__init__(dim, output_dim)
+ self.embed = nn.Embedding(n_bins, dim)
+ self.tokenizer: Tokenizer
+ if tokenizer == 'whitespace':
+ self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
+ elif tokenizer == 'noop':
+ self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
+ else:
+ raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
+
+ def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ device = self.embed.weight.device
+ tokens, mask = self.tokenizer(x)
+ tokens, mask = tokens.to(device), mask.to(device)
+ return tokens, mask
+
+ def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
+ tokens, mask = inputs
+ embeds = self.embed(tokens)
+ embeds = self.output_proj(embeds)
+ embeds = (embeds * mask.unsqueeze(-1))
+ return embeds, mask
+
+
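+# Hypothetical sketch (not part of the original module; bin/dim values are arbitrary):
+# a LUTConditioner with the lightweight 'noop' tokenizer maps each attribute string to
+# a single embedding of size `output_dim`.
+def _example_lut_conditioner() -> ConditionType:
+    conditioner = LUTConditioner(n_bins=64, dim=32, output_dim=16, tokenizer='noop')
+    tokenized = conditioner.tokenize(["rock", None, "jazz"])
+    embeds, mask = conditioner(tokenized)
+    assert embeds.shape == (3, 1, 16) and mask.shape == (3, 1)
+    return embeds, mask
+
+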
+class T5Conditioner(TextConditioner):
+ """T5-based TextConditioner.
+
+ Args:
+ name (str): Name of the T5 model.
+ output_dim (int): Output dim of the conditioner.
+ finetune (bool): Whether to fine-tune T5 at train time.
+ device (str): Device for T5 Conditioner.
+ autocast_dtype (tp.Optional[str], optional): Autocast dtype.
+ word_dropout (float, optional): Word dropout probability.
+ normalize_text (bool, optional): Whether to apply text normalization.
+ """
+ MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
+ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
+ "google/flan-t5-xl", "google/flan-t5-xxl"]
+ MODELS_DIMS = {
+ "t5-small": 512,
+ "t5-base": 768,
+ "t5-large": 1024,
+ "t5-3b": 1024,
+ "t5-11b": 1024,
+ "google/flan-t5-small": 512,
+ "google/flan-t5-base": 768,
+ "google/flan-t5-large": 1024,
+ "google/flan-t5-3b": 1024,
+ "google/flan-t5-11b": 1024,
+ }
+
+ def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
+ autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
+ normalize_text: bool = False):
+        assert name in self.MODELS, f"Unrecognized t5 model name (should be in {self.MODELS})"
+ super().__init__(self.MODELS_DIMS[name], output_dim)
+ self.device = device
+ self.name = name
+ self.finetune = finetune
+ self.word_dropout = word_dropout
+ if autocast_dtype is None or self.device == 'cpu':
+ self.autocast = TorchAutocast(enabled=False)
+ if self.device != 'cpu':
+ logger.warning("T5 has no autocast, this might lead to NaN")
+ else:
+ dtype = getattr(torch, autocast_dtype)
+ assert isinstance(dtype, torch.dtype)
+ logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
+ self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
+ # Let's disable logging temporarily because T5 will vomit some errors otherwise.
+ # thanks https://gist.github.com/simon-weber/7853144
+ previous_level = logging.root.manager.disable
+ logging.disable(logging.ERROR)
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ try:
+ self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
+ t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
+ finally:
+ logging.disable(previous_level)
+ if finetune:
+ self.t5 = t5
+ else:
+ # this makes sure that the t5 models is not part
+ # of the saved checkpoint
+ self.__dict__['t5'] = t5.to(device)
+
+ self.normalize_text = normalize_text
+ if normalize_text:
+ self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)
+
+ def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
+ # if current sample doesn't have a certain attribute, replace with empty string
+ entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
+ if self.normalize_text:
+ _, _, entries = self.text_normalizer(entries, return_text=True)
+ if self.word_dropout > 0. and self.training:
+ new_entries = []
+ for entry in entries:
+ words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
+ new_entries.append(" ".join(words))
+ entries = new_entries
+
+ empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])
+
+ inputs = self.t5_tokenizer(entries, return_tensors='pt', padding=True).to(self.device)
+ mask = inputs['attention_mask']
+        mask[empty_idx, :] = 0  # zero-out index where the input is non-existent
+ return inputs
+
+ def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
+ mask = inputs['attention_mask']
+ with torch.set_grad_enabled(self.finetune), self.autocast:
+ embeds = self.t5(**inputs).last_hidden_state
+ embeds = self.output_proj(embeds.to(self.output_proj.weight))
+ embeds = (embeds * mask.unsqueeze(-1))
+ return embeds, mask
+
+
+class WaveformConditioner(BaseConditioner):
+ """Base class for all conditioners that take a waveform as input.
+ Classes that inherit must implement `_get_wav_embedding` that outputs
+ a continuous tensor, and `_downsampling_factor` that returns the down-sampling
+ factor of the embedding model.
+
+ Args:
+ dim (int): The internal representation dimension.
+ output_dim (int): Output dimension.
+ device (tp.Union[torch.device, str]): Device.
+ """
+ def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
+ super().__init__(dim, output_dim)
+ self.device = device
+        # if False, no masking is done; used in ChromaStemConditioner when completing a sample by periodicity.
+ self._use_masking = True
+
+ def tokenize(self, x: WavCondition) -> WavCondition:
+ wav, length, sample_rate, path, seek_time = x
+ assert length is not None
+ return WavCondition(wav.to(self.device), length.to(self.device), sample_rate, path, seek_time)
+
+ def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
+ """Gets as input a WavCondition and returns a dense embedding."""
+ raise NotImplementedError()
+
+ def _downsampling_factor(self):
+ """Returns the downsampling factor of the embedding model."""
+ raise NotImplementedError()
+
+ def forward(self, x: WavCondition) -> ConditionType:
+ """Extract condition embedding and mask from a waveform and its metadata.
+ Args:
+ x (WavCondition): Waveform condition containing raw waveform and metadata.
+ Returns:
+ ConditionType: a dense vector representing the conditioning along with its mask
+ """
+ wav, lengths, *_ = x
+ with torch.no_grad():
+ embeds = self._get_wav_embedding(x)
+ embeds = embeds.to(self.output_proj.weight)
+ embeds = self.output_proj(embeds)
+
+ if lengths is not None and self._use_masking:
+ lengths = lengths / self._downsampling_factor()
+ mask = length_to_mask(lengths, max_len=embeds.shape[1]).int() # type: ignore
+ else:
+ mask = torch.ones_like(embeds[..., 0])
+ embeds = (embeds * mask.unsqueeze(-1))
+ return embeds, mask
+
+
+class ChromaStemConditioner(WaveformConditioner):
+ """Chroma conditioner based on stems.
+ The ChromaStemConditioner uses DEMUCS to first filter out drums and bass, as
+ the drums and bass often dominate the chroma leading to the chroma features
+ not containing information about the melody.
+
+ Args:
+ output_dim (int): Output dimension for the conditioner.
+ sample_rate (int): Sample rate for the chroma extractor.
+ n_chroma (int): Number of chroma bins for the chroma extractor.
+ radix2_exp (int): Size of stft window for the chroma extractor (power of 2, e.g. 12 -> 2^12).
+        duration (float): Duration used during training. This is later used for correct padding
+            in case we are using chroma as prefix.
+        match_len_on_eval (bool, optional): if True then all chromas are padded to the training
+            duration. Defaults to True.
+        eval_wavs (str, optional): Path to a dataset manifest with waveforms; these waveforms are used as
+            conditions during eval (for cases where we don't want to leak test conditions like MusicCaps).
+            Defaults to None.
+ n_eval_wavs (int, optional): limits the number of waveforms used for conditioning. Defaults to 0.
+ device (tp.Union[torch.device, str], optional): Device for the conditioner.
+ **kwargs: Additional parameters for the chroma extractor.
+ """
+ def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
+ duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
+ n_eval_wavs: int = 0, cache_path: tp.Optional[tp.Union[str, Path]] = None,
+ device: tp.Union[torch.device, str] = 'cpu', **kwargs):
+ from demucs import pretrained
+ super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
+ self.autocast = TorchAutocast(enabled=device != 'cpu', device_type=self.device, dtype=torch.float32)
+ self.sample_rate = sample_rate
+ self.match_len_on_eval = match_len_on_eval
+ if match_len_on_eval:
+ self._use_masking = False
+ self.duration = duration
+ self.__dict__['demucs'] = pretrained.get_model('htdemucs').to(device)
+ stem_sources: list = self.demucs.sources # type: ignore
+ self.stem_indices = torch.LongTensor([stem_sources.index('vocals'), stem_sources.index('other')]).to(device)
+ self.chroma = ChromaExtractor(sample_rate=sample_rate, n_chroma=n_chroma,
+ radix2_exp=radix2_exp, **kwargs).to(device)
+ self.chroma_len = self._get_chroma_len()
+ self.eval_wavs: tp.Optional[torch.Tensor] = self._load_eval_wavs(eval_wavs, n_eval_wavs)
+ self.cache = None
+ if cache_path is not None:
+ self.cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
+ compute_embed_fn=self._get_full_chroma_for_cache,
+ extract_embed_fn=self._extract_chroma_chunk)
+
+ def _downsampling_factor(self) -> int:
+ return self.chroma.winhop
+
+ def _load_eval_wavs(self, path: tp.Optional[str], num_samples: int) -> tp.Optional[torch.Tensor]:
+ """Load pre-defined waveforms from a json.
+ These waveforms will be used for chroma extraction during evaluation.
+ This is done to make the evaluation on MusicCaps fair (we shouldn't see the chromas of MusicCaps).
+ """
+ if path is None:
+ return None
+
+ logger.info(f"Loading evaluation wavs from {path}")
+ from audiocraft.data.audio_dataset import AudioDataset
+ dataset: AudioDataset = AudioDataset.from_meta(
+ path, segment_duration=self.duration, min_audio_duration=self.duration,
+ sample_rate=self.sample_rate, channels=1)
+
+ if len(dataset) > 0:
+ eval_wavs = dataset.collater([dataset[i] for i in range(num_samples)]).to(self.device)
+ logger.info(f"Using {len(eval_wavs)} evaluation wavs for chroma-stem conditioner")
+ return eval_wavs
+ else:
+ raise ValueError("Could not find evaluation wavs, check lengths of wavs")
+
+ def reset_eval_wavs(self, eval_wavs: tp.Optional[torch.Tensor]) -> None:
+ self.eval_wavs = eval_wavs
+
+ def has_eval_wavs(self) -> bool:
+ return self.eval_wavs is not None
+
+ def _sample_eval_wavs(self, num_samples: int) -> torch.Tensor:
+ """Sample wavs from a predefined list."""
+ assert self.eval_wavs is not None, "Cannot sample eval wavs as no eval wavs provided."
+ total_eval_wavs = len(self.eval_wavs)
+ out = self.eval_wavs
+ if num_samples > total_eval_wavs:
+ out = self.eval_wavs.repeat(num_samples // total_eval_wavs + 1, 1, 1)
+ return out[torch.randperm(len(out))][:num_samples]
+
+ def _get_chroma_len(self) -> int:
+ """Get length of chroma during training."""
+ dummy_wav = torch.zeros((1, int(self.sample_rate * self.duration)), device=self.device)
+ dummy_chr = self.chroma(dummy_wav)
+ return dummy_chr.shape[1]
+
+ @torch.no_grad()
+ def _get_stemmed_wav(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
+ """Get parts of the wav that holds the melody, extracting the main stems from the wav."""
+ from demucs.apply import apply_model
+ from demucs.audio import convert_audio
+ with self.autocast:
+ wav = convert_audio(
+ wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels) # type: ignore
+ stems = apply_model(self.demucs, wav, device=self.device)
+ stems = stems[:, self.stem_indices] # extract relevant stems for melody conditioning
+ mix_wav = stems.sum(1) # merge extracted stems to single waveform
+ mix_wav = convert_audio(mix_wav, self.demucs.samplerate, self.sample_rate, 1) # type: ignore
+ return mix_wav
+
+ @torch.no_grad()
+ def _extract_chroma(self, wav: torch.Tensor) -> torch.Tensor:
+ """Extract chroma features from the waveform."""
+ with self.autocast:
+ return self.chroma(wav)
+
+ @torch.no_grad()
+ def _compute_wav_embedding(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
+ """Compute wav embedding, applying stem and chroma extraction."""
+ # avoid 0-size tensors when we are working with null conds
+ if wav.shape[-1] == 1:
+ return self._extract_chroma(wav)
+ stems = self._get_stemmed_wav(wav, sample_rate)
+ chroma = self._extract_chroma(stems)
+ return chroma
+
+ @torch.no_grad()
+ def _get_full_chroma_for_cache(self, path: tp.Union[str, Path], x: WavCondition, idx: int) -> torch.Tensor:
+ """Extract chroma from the whole audio waveform at the given path."""
+ wav, sr = audio_read(path)
+ wav = wav[None].to(self.device)
+ wav = convert_audio(wav, sr, self.sample_rate, to_channels=1)
+ chroma = self._compute_wav_embedding(wav, self.sample_rate)[0]
+ return chroma
+
+ def _extract_chroma_chunk(self, full_chroma: torch.Tensor, x: WavCondition, idx: int) -> torch.Tensor:
+ """Extract a chunk of chroma from the full chroma derived from the full waveform."""
+ wav_length = x.wav.shape[-1]
+ seek_time = x.seek_time[idx]
+ assert seek_time is not None, (
+ "WavCondition seek_time is required "
+ "when extracting chroma chunks from pre-computed chroma.")
+ full_chroma = full_chroma.float()
+ frame_rate = self.sample_rate / self._downsampling_factor()
+ target_length = int(frame_rate * wav_length / self.sample_rate)
+ index = int(frame_rate * seek_time)
+ out = full_chroma[index: index + target_length]
+ out = F.pad(out[None], (0, 0, 0, target_length - out.shape[0]))[0]
+ return out.to(self.device)
+
+ @torch.no_grad()
+ def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
+ """Get the wav embedding from the WavCondition.
+        The conditioner will either extract the embedding on-the-fly, computing it from the condition wav directly,
+        or rely on the embedding cache to load the pre-computed embedding if relevant.
+ """
+ sampled_wav: tp.Optional[torch.Tensor] = None
+ if not self.training and self.eval_wavs is not None:
+ warn_once(logger, "Using precomputed evaluation wavs!")
+ sampled_wav = self._sample_eval_wavs(len(x.wav))
+
+ no_undefined_paths = all(p is not None for p in x.path)
+ no_nullified_cond = x.wav.shape[-1] > 1
+ if sampled_wav is not None:
+ chroma = self._compute_wav_embedding(sampled_wav, self.sample_rate)
+ elif self.cache is not None and no_undefined_paths and no_nullified_cond:
+ paths = [Path(p) for p in x.path if p is not None]
+ chroma = self.cache.get_embed_from_cache(paths, x)
+ else:
+ assert all(sr == x.sample_rate[0] for sr in x.sample_rate), "All sample rates in batch should be equal."
+ chroma = self._compute_wav_embedding(x.wav, x.sample_rate[0])
+
+ if self.match_len_on_eval:
+ B, T, C = chroma.shape
+ if T > self.chroma_len:
+ chroma = chroma[:, :self.chroma_len]
+ logger.debug(f"Chroma was truncated to match length! ({T} -> {chroma.shape[1]})")
+ elif T < self.chroma_len:
+ n_repeat = int(math.ceil(self.chroma_len / T))
+ chroma = chroma.repeat(1, n_repeat, 1)
+ chroma = chroma[:, :self.chroma_len]
+ logger.debug(f"Chroma was repeated to match length! ({T} -> {chroma.shape[1]})")
+
+ return chroma
+
+ def tokenize(self, x: WavCondition) -> WavCondition:
+ """Apply WavConditioner tokenization and populate cache if needed."""
+ x = super().tokenize(x)
+ no_undefined_paths = all(p is not None for p in x.path)
+ if self.cache is not None and no_undefined_paths:
+ paths = [Path(p) for p in x.path if p is not None]
+ self.cache.populate_embed_cache(paths, x)
+ return x
+
+
+class JointEmbeddingConditioner(BaseConditioner):
+ """Joint embedding conditioning supporting both audio or text conditioning.
+
+ Args:
+ dim (int): Dimension.
+ output_dim (int): Output dimension.
+ device (str): Device.
+ attribute (str): Attribute used by the conditioner.
+ autocast_dtype (str): Autocast for the conditioner.
+ quantize (bool): Whether to quantize the CLAP embedding.
+ n_q (int): Number of residual quantizers (used if quantize is true).
+ bins (int): Quantizers' codebooks size (used if quantize is true).
+ kwargs: Additional parameters for residual vector quantizer.
+ """
+ def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
+ autocast_dtype: tp.Optional[str] = 'float32', quantize: bool = True,
+ n_q: int = 12, bins: int = 1024, **kwargs):
+ super().__init__(dim=dim, output_dim=output_dim)
+ self.device = device
+ self.attribute = attribute
+ if autocast_dtype is None or device == 'cpu':
+ self.autocast = TorchAutocast(enabled=False)
+ logger.warning("JointEmbeddingConditioner has no autocast, this might lead to NaN.")
+ else:
+ dtype = getattr(torch, autocast_dtype)
+ assert isinstance(dtype, torch.dtype)
+ logger.info(f"JointEmbeddingConditioner will be evaluated with autocast as {autocast_dtype}.")
+ self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
+ # residual vector quantizer to discretize the conditioned embedding
+ self.quantizer: tp.Optional[ResidualVectorQuantizer] = None
+ if quantize:
+ self.quantizer = ResidualVectorQuantizer(dim, n_q=n_q, bins=bins, **kwargs)
+
+ def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ """Get joint embedding in latent space from the inputs.
+
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]: Tensor for the latent embedding
+ and corresponding empty indexes.
+ """
+ raise NotImplementedError()
+
+ def forward(self, x: JointEmbedCondition) -> ConditionType:
+ with self.autocast:
+ embed, empty_idx = self._get_embed(x)
+ if self.quantizer is not None:
+ embed = embed.view(-1, self.dim, 1)
+ q_res = self.quantizer(embed, frame_rate=1)
+ out_embed = q_res.x.view(-1, self.dim)
+ else:
+ out_embed = embed
+ out_embed = self.output_proj(out_embed).view(-1, 1, self.output_dim)
+ mask = torch.ones(*out_embed.shape[:2], device=out_embed.device)
+            mask[empty_idx, :] = 0  # zero-out index where the input is non-existent
+ out_embed = (out_embed * mask.unsqueeze(-1))
+ return out_embed, mask
+
+ def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
+ return x
+
+
+class CLAPEmbeddingConditioner(JointEmbeddingConditioner):
+ """Joint Embedding conditioner based on pre-trained CLAP model.
+
+ This CLAP-based conditioner supports a caching mechanism
+ over the computed embeddings for faster training.
+
+ Args:
+ dim (int): Dimension.
+ output_dim (int): Output dimension.
+ device (str): Device.
+ attribute (str): Attribute used by the conditioner.
+ quantize (bool): Whether to quantize the CLAP embedding.
+ n_q (int): Number of residual quantizers (used if quantize is true).
+ bins (int): Quantizers' codebooks size (used if quantize is true).
+ checkpoint (str): Path to CLAP checkpoint.
+ model_arch (str): CLAP model architecture.
+ enable_fusion (bool): Enable fusion for CLAP model.
+ sample_rate (int): Sample rate used by CLAP model.
+ max_audio_length (float): Maximum audio length for CLAP model.
+ audio_stride (float): Stride to use for getting a CLAP embedding on the full sequence.
+ normalize (bool): Whether to normalize the CLAP embedding.
+ text_p (float): Probability of using text representation instead of audio at train time.
+ batch_size (Optional[int]): Batch size for CLAP embedding computation.
+ autocast_dtype (str): Autocast for the conditioner.
+ cache_path (Optional[str]): Path for pre-computed embeddings caching.
+ kwargs: Additional parameters for residual vector quantizer.
+ """
+ def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
+ quantize: bool, n_q: int, bins: int, checkpoint: tp.Union[str, Path], model_arch: str,
+ enable_fusion: bool, sample_rate: int, max_audio_length: int, audio_stride: int,
+                 normalize: bool, text_p: float, batch_size: tp.Optional[int] = None,
+ autocast_dtype: tp.Optional[str] = 'float32', cache_path: tp.Optional[str] = None, **kwargs):
+ try:
+ import laion_clap # type: ignore
+ except ImportError:
+ raise ImportError("Please install CLAP to use the CLAPEmbeddingConditioner: 'pip install laion_clap'")
+ warnings.warn("Sample rate for CLAP conditioner was fixed in version v1.1.0, (from 44.1 to 48 kHz). "
+ "Please retrain all models.")
+ checkpoint = AudioCraftEnvironment.resolve_reference_path(checkpoint)
+ clap_tokenize = RobertaTokenizer.from_pretrained('roberta-base')
+ clap_model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
+ load_clap_state_dict(clap_model, checkpoint)
+ clap_model.eval()
+ clap_model.to(device)
+ super().__init__(dim=dim, output_dim=output_dim, device=device, attribute=attribute,
+ autocast_dtype=autocast_dtype, quantize=quantize, n_q=n_q, bins=bins,
+ **kwargs)
+ self.checkpoint = checkpoint
+ self.enable_fusion = enable_fusion
+ self.model_arch = model_arch
+ self.clap: laion_clap.CLAP_Module
+ self.clap_tokenize: RobertaTokenizer
+ self.clap_sample_rate = sample_rate
+ self.clap_max_frames = int(self.clap_sample_rate * max_audio_length)
+ self.clap_stride = int(self.clap_sample_rate * audio_stride)
+ self.batch_size = batch_size or 1
+ self.normalize = normalize
+ self.text_p = text_p
+ self.__dict__['clap_tokenize'] = clap_tokenize
+ self.__dict__['clap'] = clap_model
+ self.wav_cache, self.text_cache = None, None
+ if cache_path is not None:
+ self.wav_cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
+ compute_embed_fn=self._get_wav_embedding_for_cache,
+ extract_embed_fn=self._extract_wav_embedding_chunk)
+ self.text_cache = EmbeddingCache(Path(cache_path) / 'text', self.device,
+ compute_embed_fn=self._get_text_embedding_for_cache)
+
+ def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
+ # we use the default params from CLAP module here as well
+ return self.clap_tokenize(texts, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
+
+ def _compute_text_embedding(self, text: tp.List[str]) -> torch.Tensor:
+ """Compute text embedding from CLAP model on a given a batch of text.
+
+ Args:
+ text (list[str]): List of text for the batch, with B items.
+ Returns:
+ torch.Tensor: CLAP embedding derived from text, of shape [B, 1, D], with D the CLAP embedding dimension.
+ """
+ with torch.no_grad():
+ embed = self.clap.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
+ return embed.view(embed.size(0), 1, embed.size(-1))
+
+ def _get_text_embedding_for_cache(self, path: tp.Union[Path, str],
+ x: JointEmbedCondition, idx: int) -> torch.Tensor:
+ """Get text embedding function for the cache."""
+ text = x.text[idx]
+ text = text if text is not None else ""
+ return self._compute_text_embedding([text])[0]
+
+ def _preprocess_wav(self, wav: torch.Tensor, length: torch.Tensor, sample_rates: tp.List[int]) -> torch.Tensor:
+ """Preprocess wav to expected format by CLAP model.
+
+ Args:
+ wav (torch.Tensor): Audio wav, of shape [B, C, T].
+ length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
+ sample_rates (list[int]): Sample rates for each sample in the batch
+ Returns:
+ torch.Tensor: Audio wav of shape [B, T].
+ """
+ assert wav.dim() == 3, "Expecting wav to be [B, C, T]"
+ if sample_rates is not None:
+ _wav = []
+ for i, audio in enumerate(wav):
+ sr = sample_rates[i]
+ audio = convert_audio(audio, from_rate=sr, to_rate=self.clap_sample_rate, to_channels=1)
+ _wav.append(audio)
+ wav = torch.stack(_wav, dim=0)
+ wav = wav.mean(dim=1)
+ return wav
+
+ def _compute_wav_embedding(self, wav: torch.Tensor, length: torch.Tensor,
+ sample_rates: tp.List[int], reduce_mean: bool = False) -> torch.Tensor:
+ """Compute audio wave embedding from CLAP model.
+
+        Since CLAP operates on fixed-length audio inputs and we need to process longer audio sequences,
+ we calculate the wav embeddings on `clap_max_frames` windows with `clap_stride`-second stride and
+ average the resulting embeddings.
+
+ Args:
+ wav (torch.Tensor): Audio wav, of shape [B, C, T].
+ length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
+ sample_rates (list[int]): Sample rates for each sample in the batch.
+            reduce_mean (bool): Whether to average the resulting embeddings over the chunks.
+ Returns:
+ torch.Tensor: Audio embedding of shape [B, F, D], F being the number of chunks, D the dimension.
+ """
+ with torch.no_grad():
+ wav = self._preprocess_wav(wav, length, sample_rates)
+ B, T = wav.shape
+ if T >= self.clap_max_frames:
+ wav = wav.unfold(-1, self.clap_max_frames, self.clap_stride) # [B, F, T]
+ else:
+ wav = wav.view(-1, 1, T) # [B, F, T] with F=1
+ wav = einops.rearrange(wav, 'b f t -> (b f) t')
+ embed_list = []
+ for i in range(0, wav.size(0), self.batch_size):
+ _wav = wav[i:i+self.batch_size, ...]
+ _embed = self.clap.get_audio_embedding_from_data(_wav, use_tensor=True)
+ embed_list.append(_embed)
+ embed = torch.cat(embed_list, dim=0)
+ embed = einops.rearrange(embed, '(b f) d -> b f d', b=B)
+ if reduce_mean:
+ embed = embed.mean(dim=1, keepdim=True)
+ return embed # [B, F, D] with F=1 if reduce_mean is True
+
+ def _get_wav_embedding_for_cache(self, path: tp.Union[str, Path],
+ x: JointEmbedCondition, idx: int) -> torch.Tensor:
+ """Compute audio wave embedding for the cache.
+ The embedding is computed on a given audio read from file.
+
+ Args:
+ path (str or Path): Path to the full audio file.
+ Returns:
+ torch.Tensor: Single-item tensor of shape [F, D], F being the number of chunks, D the dimension.
+ """
+ wav, sr = audio_read(path) # [C, T]
+ wav = wav.unsqueeze(0).to(self.device) # [1, C, T]
+ wav_len = torch.LongTensor([wav.shape[-1]]).to(self.device)
+ embed = self._compute_wav_embedding(wav, wav_len, [sr], reduce_mean=False) # [B, F, D]
+ return embed.squeeze(0) # [F, D]
+
+ def _extract_wav_embedding_chunk(self, full_embed: torch.Tensor, x: JointEmbedCondition, idx: int) -> torch.Tensor:
+ """Extract the chunk of embedding matching the seek_time and length from the full CLAP audio embedding.
+
+ Args:
+ full_embed (torch.Tensor): CLAP embedding computed on the full wave, of shape [F, D].
+ x (JointEmbedCondition): Joint embedding condition for the full batch.
+ idx (int): Index considered for the given embedding to extract.
+ Returns:
+ torch.Tensor: Wav embedding averaged on sliding window, of shape [1, D].
+ """
+ sample_rate = x.sample_rate[idx]
+ seek_time = x.seek_time[idx]
+ seek_time = 0. if seek_time is None else seek_time
+ clap_stride = int(self.clap_stride / self.clap_sample_rate) * sample_rate
+ end_seek_time = seek_time + self.clap_max_frames / self.clap_sample_rate
+ start_offset = int(seek_time * sample_rate // clap_stride)
+ end_offset = int(end_seek_time * sample_rate // clap_stride)
+ wav_embed = full_embed[start_offset:end_offset, ...]
+ wav_embed = wav_embed.mean(dim=0, keepdim=True)
+ return wav_embed.to(self.device) # [F, D]
+
+ def _get_text_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
+ """Get CLAP embedding from a batch of text descriptions."""
+        no_nullified_cond = x.wav.shape[-1] > 1  # we don't want to read from cache when the condition was dropped
+ if self.text_cache is not None and no_nullified_cond:
+ assert all(p is not None for p in x.path), "Cache requires all JointEmbedCondition paths to be provided"
+ paths = [Path(p) for p in x.path if p is not None]
+ embed = self.text_cache.get_embed_from_cache(paths, x)
+ else:
+ text = [xi if xi is not None else "" for xi in x.text]
+ embed = self._compute_text_embedding(text)
+ if self.normalize:
+ embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
+ return embed
+
+ def _get_wav_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
+ """Get CLAP embedding from a batch of audio tensors (and corresponding sample rates)."""
+ no_undefined_paths = all(p is not None for p in x.path)
+        no_nullified_cond = x.wav.shape[-1] > 1  # we don't want to read from cache when the condition was dropped
+ if self.wav_cache is not None and no_undefined_paths and no_nullified_cond:
+ paths = [Path(p) for p in x.path if p is not None]
+ embed = self.wav_cache.get_embed_from_cache(paths, x)
+ else:
+ embed = self._compute_wav_embedding(x.wav, x.length, x.sample_rate, reduce_mean=True)
+ if self.normalize:
+ embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
+ return embed
+
+ def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
+ # Trying to limit as much as possible sync points when the cache is warm.
+ no_undefined_paths = all(p is not None for p in x.path)
+ if self.wav_cache is not None and no_undefined_paths:
+ assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
+ paths = [Path(p) for p in x.path if p is not None]
+ self.wav_cache.populate_embed_cache(paths, x)
+ if self.text_cache is not None and no_undefined_paths:
+ assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
+ paths = [Path(p) for p in x.path if p is not None]
+ self.text_cache.populate_embed_cache(paths, x)
+ return x
+
+ def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+ """Extract shared latent representation from either the wav or the text using CLAP."""
+ # decide whether to use text embedding at train time or not
+ use_text_embed = random.random() < self.text_p
+ if self.training and not use_text_embed:
+ embed = self._get_wav_embedding(x)
+ empty_idx = torch.LongTensor([]) # we assume we always have the audio wav
+ else:
+ embed = self._get_text_embedding(x)
+ empty_idx = torch.LongTensor([i for i, xi in enumerate(x.text) if xi is None or xi == ""])
+ return embed, empty_idx
+
+
+def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str) -> ConditioningAttributes:
+ """Utility function for nullifying an attribute inside an ConditioningAttributes object.
+ If the condition is of type "wav", then nullify it using `nullify_condition` function.
+ If the condition is of any other type, set its value to None.
+ Works in-place.
+ """
+ if condition_type not in ['text', 'wav', 'joint_embed']:
+ raise ValueError(
+ "dropout_condition got an unexpected condition type!"
+ f" expected 'text', 'wav' or 'joint_embed' but got '{condition_type}'"
+ )
+
+ if condition not in getattr(sample, condition_type):
+ raise ValueError(
+ "dropout_condition received an unexpected condition!"
+ f" expected wav={sample.wav.keys()} and text={sample.text.keys()}"
+ f" but got '{condition}' of type '{condition_type}'!"
+ )
+
+ if condition_type == 'wav':
+ wav_cond = sample.wav[condition]
+ sample.wav[condition] = nullify_wav(wav_cond)
+ elif condition_type == 'joint_embed':
+ embed = sample.joint_embed[condition]
+ sample.joint_embed[condition] = nullify_joint_embed(embed)
+ else:
+ sample.text[condition] = None
+
+ return sample
+
+
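+# Minimal sketch (not part of the original module; the attribute name is an arbitrary
+# example): dropping a text attribute sets it to None in-place, while a wav attribute
+# would instead be replaced by a nullified WavCondition.
+def _example_dropout_condition() -> ConditioningAttributes:
+    attrs = ConditioningAttributes(text={"description": "A calm piano piece"})
+    dropout_condition(attrs, "text", "description")
+    assert attrs.text["description"] is None
+    return attrs
+
+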
+class DropoutModule(nn.Module):
+ """Base module for all dropout modules."""
+ def __init__(self, seed: int = 1234):
+ super().__init__()
+ self.rng = torch.Generator()
+ self.rng.manual_seed(seed)
+
+
+class AttributeDropout(DropoutModule):
+ """Dropout with a given probability per attribute.
+ This is different from the behavior of ClassifierFreeGuidanceDropout as this allows for attributes
+ to be dropped out separately. For example, "artist" can be dropped while "genre" remains.
+ This is in contrast to ClassifierFreeGuidanceDropout where if "artist" is dropped "genre"
+ must also be dropped.
+
+ Args:
+        p (tp.Dict[str, tp.Dict[str, float]]): A dict mapping each condition type to a dict of
+            per-attribute dropout probabilities. For example:
+            {
+                "text": {"genre": 0.1, "artist": 0.5},
+                "wav": {"wav": 0.25},
+            }
+ active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
+ seed (int, optional): Random seed.
+ """
+ def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
+ super().__init__(seed=seed)
+ self.active_on_eval = active_on_eval
+        # construct a dict that returns the values from p, otherwise 0
+ self.p = {}
+ for condition_type, probs in p.items():
+ self.p[condition_type] = defaultdict(lambda: 0, probs)
+
+ def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+ """
+ Args:
+ samples (list[ConditioningAttributes]): List of conditions.
+ Returns:
+ list[ConditioningAttributes]: List of conditions after certain attributes were set to None.
+ """
+ if not self.training and not self.active_on_eval:
+ return samples
+
+ samples = deepcopy(samples)
+ for condition_type, ps in self.p.items(): # for condition types [text, wav]
+ for condition, p in ps.items(): # for attributes of each type (e.g., [artist, genre])
+ if torch.rand(1, generator=self.rng).item() < p:
+ for sample in samples:
+ dropout_condition(sample, condition_type, condition)
+ return samples
+
+ def __repr__(self):
+ return f"AttributeDropout({dict(self.p)})"
+
+
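+# Illustrative sketch (not part of the original module; attribute names and probabilities
+# are arbitrary): with per-attribute probabilities, "artist" can be dropped for the whole
+# batch while "genre" survives. Dropout is only applied in training mode.
+def _example_attribute_dropout() -> tp.List[ConditioningAttributes]:
+    dropout = AttributeDropout(p={"text": {"artist": 1.0, "genre": 0.0}})
+    dropout.train()
+    samples = [ConditioningAttributes(text={"artist": "Queen", "genre": "Rock"})]
+    out = dropout(samples)
+    assert out[0].text["artist"] is None and out[0].text["genre"] == "Rock"
+    return out
+
+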
+class ClassifierFreeGuidanceDropout(DropoutModule):
+ """Classifier Free Guidance dropout.
+ All attributes are dropped with the same probability.
+
+ Args:
+ p (float): Probability to apply condition dropout during training.
+ seed (int): Random seed.
+ """
+ def __init__(self, p: float, seed: int = 1234):
+ super().__init__(seed=seed)
+ self.p = p
+
+ def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+ """
+ Args:
+ samples (list[ConditioningAttributes]): List of conditions.
+ Returns:
+ list[ConditioningAttributes]: List of conditions after all attributes were set to None.
+ """
+ if not self.training:
+ return samples
+
+ # decide on which attributes to drop in a batched fashion
+ drop = torch.rand(1, generator=self.rng).item() < self.p
+ if not drop:
+ return samples
+
+ # nullify conditions of all attributes
+ samples = deepcopy(samples)
+ for condition_type in ["wav", "text"]:
+ for sample in samples:
+ for condition in sample.attributes[condition_type]:
+ dropout_condition(sample, condition_type, condition)
+ return samples
+
+ def __repr__(self):
+ return f"ClassifierFreeGuidanceDropout(p={self.p})"
+
+
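+# Minimal sketch (not part of the original module): with p=1.0 every attribute of every
+# sample is nullified in training mode, which is what produces the unconditional branch
+# used for classifier-free guidance.
+def _example_cfg_dropout() -> tp.List[ConditioningAttributes]:
+    dropout = ClassifierFreeGuidanceDropout(p=1.0)
+    dropout.train()
+    samples = [ConditioningAttributes(text={"description": "A rock song"})]
+    out = dropout(samples)
+    assert out[0].text["description"] is None
+    return out
+
+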
+class ConditioningProvider(nn.Module):
+ """Prepare and provide conditions given all the supported conditioners.
+
+ Args:
+ conditioners (dict): Dictionary of conditioners.
+ device (torch.device or str, optional): Device for conditioners and output condition types.
+ """
+ def __init__(self, conditioners: tp.Dict[str, BaseConditioner], device: tp.Union[torch.device, str] = "cpu"):
+ super().__init__()
+ self.device = device
+ self.conditioners = nn.ModuleDict(conditioners)
+
+ @property
+ def joint_embed_conditions(self):
+ return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
+
+ @property
+ def has_joint_embed_conditions(self):
+ return len(self.joint_embed_conditions) > 0
+
+ @property
+ def text_conditions(self):
+ return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
+
+ @property
+ def wav_conditions(self):
+ return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
+
+ @property
+ def has_wav_condition(self):
+ return len(self.wav_conditions) > 0
+
+ def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
+ """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
+ This should be called before starting any real GPU work to avoid synchronization points.
+ This will return a dict matching conditioner names to their arbitrary tokenized representations.
+
+ Args:
+ inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
+ text and wav conditions.
+ """
+ assert all([isinstance(x, ConditioningAttributes) for x in inputs]), (
+ "Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]",
+ f" but types were {set([type(x) for x in inputs])}"
+ )
+
+ output = {}
+ text = self._collate_text(inputs)
+ wavs = self._collate_wavs(inputs)
+ joint_embeds = self._collate_joint_embeds(inputs)
+
+ assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
+ f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
+ f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
+ )
+
+ for attribute, batch in chain(text.items(), wavs.items(), joint_embeds.items()):
+ output[attribute] = self.conditioners[attribute].tokenize(batch)
+ return output
+
+ def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
+ """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
+ The output is for example:
+ {
+ "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
+ "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
+ ...
+ }
+
+ Args:
+ tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
+ """
+ output = {}
+ for attribute, inputs in tokenized.items():
+ condition, mask = self.conditioners[attribute](inputs)
+ output[attribute] = (condition, mask)
+ return output
+
+ def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
+ """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
+ are the attributes and the values are the aggregated input per attribute.
+ For example:
+ Input:
+ [
+ ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
+ ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
+ ]
+ Output:
+ {
+ "genre": ["Rock", "Hip-hop"],
+ "description": ["A rock song with a guitar solo", "A hip-hop verse"]
+ }
+
+ Args:
+ samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
+ Returns:
+            dict[str, list[tp.Optional[str]]]: A dictionary mapping an attribute name to a text batch.
+ """
+ out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
+ texts = [x.text for x in samples]
+ for text in texts:
+ for condition in self.text_conditions:
+ out[condition].append(text[condition])
+ return out
+
+ def _collate_wavs(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, WavCondition]:
+ """Generate a dict where the keys are attributes by which we fetch similar wavs,
+ and the values are Tensors of wavs according to said attributes.
+
+ *Note*: by the time the samples reach this function, each sample should have some waveform
+ inside the "wav" attribute. It should be either:
+ 1. A real waveform
+ 2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
+ 3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
+
+ Args:
+ samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
+ Returns:
+ dict[str, WavCondition]: A dictionary mapping an attribute name to wavs.
+ """
+ wavs = defaultdict(list)
+ lengths = defaultdict(list)
+ sample_rates = defaultdict(list)
+ paths = defaultdict(list)
+ seek_times = defaultdict(list)
+ out: tp.Dict[str, WavCondition] = {}
+
+ for sample in samples:
+ for attribute in self.wav_conditions:
+ wav, length, sample_rate, path, seek_time = sample.wav[attribute]
+ assert wav.dim() == 3, f"Got wav with dim={wav.dim()}, but expected 3 [1, C, T]"
+ assert wav.size(0) == 1, f"Got wav [B, C, T] with shape={wav.shape}, but expected B == 1"
+ # mono-channel conditioning
+ wav = wav.mean(1, keepdim=True) # [1, 1, T]
+ wavs[attribute].append(wav.flatten()) # [T]
+ lengths[attribute].append(length)
+ sample_rates[attribute].extend(sample_rate)
+ paths[attribute].extend(path)
+ seek_times[attribute].extend(seek_time)
+
+ # stack all wavs to a single tensor
+ for attribute in self.wav_conditions:
+ stacked_wav, _ = collate(wavs[attribute], dim=0)
+ out[attribute] = WavCondition(
+ stacked_wav.unsqueeze(1), torch.cat(lengths[attribute]), sample_rates[attribute],
+ paths[attribute], seek_times[attribute])
+
+ return out
+
+ def _collate_joint_embeds(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, JointEmbedCondition]:
+ """Generate a dict where the keys are attributes by which we compute joint embeddings,
+ and the values are Tensors of pre-computed embeddings and the corresponding text attributes.
+
+ Args:
+ samples (list[ConditioningAttributes]): List of ConditioningAttributes samples.
+ Returns:
+ A dictionary mapping an attribute name to joint embeddings.
+ """
+ texts = defaultdict(list)
+ wavs = defaultdict(list)
+ lengths = defaultdict(list)
+ sample_rates = defaultdict(list)
+ paths = defaultdict(list)
+ seek_times = defaultdict(list)
+ channels: int = 0
+
+ out = {}
+ for sample in samples:
+ for attribute in self.joint_embed_conditions:
+ wav, text, length, sample_rate, path, seek_time = sample.joint_embed[attribute]
+ assert wav.dim() == 3
+ if channels == 0:
+ channels = wav.size(1)
+ else:
+ assert channels == wav.size(1), "not all audio has same number of channels in batch"
+ assert wav.size(0) == 1, "Expecting single-wav batch in the collate method"
+ wav = einops.rearrange(wav, "b c t -> (b c t)") # [1, C, T] => [C * T]
+ wavs[attribute].append(wav)
+ texts[attribute].extend(text)
+ lengths[attribute].append(length)
+ sample_rates[attribute].extend(sample_rate)
+ paths[attribute].extend(path)
+ seek_times[attribute].extend(seek_time)
+
+ for attribute in self.joint_embed_conditions:
+ stacked_texts = texts[attribute]
+ stacked_paths = paths[attribute]
+ stacked_seek_times = seek_times[attribute]
+ stacked_wavs = pad_sequence(wavs[attribute]).to(self.device)
+ stacked_wavs = einops.rearrange(stacked_wavs, "(c t) b -> b c t", c=channels)
+ stacked_sample_rates = sample_rates[attribute]
+ stacked_lengths = torch.cat(lengths[attribute]).to(self.device)
+ assert stacked_lengths.size(0) == stacked_wavs.size(0)
+ assert len(stacked_sample_rates) == stacked_wavs.size(0)
+ assert len(stacked_texts) == stacked_wavs.size(0)
+ out[attribute] = JointEmbedCondition(
+ text=stacked_texts, wav=stacked_wavs,
+ length=stacked_lengths, sample_rate=stacked_sample_rates,
+ path=stacked_paths, seek_time=stacked_seek_times)
+
+ return out
+
+
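+# Hypothetical end-to-end sketch (not part of the original module; attribute name and
+# sizes are arbitrary): a provider holding a single text conditioner (a LUTConditioner
+# with the 'noop' tokenizer) turns a batch of ConditioningAttributes into per-attribute
+# `(embedding, mask)` pairs.
+def _example_conditioning_provider() -> tp.Dict[str, ConditionType]:
+    provider = ConditioningProvider(
+        {"genre": LUTConditioner(n_bins=64, dim=32, output_dim=16, tokenizer='noop')})
+    attributes = [
+        ConditioningAttributes(text={"genre": "Rock"}),
+        ConditioningAttributes(text={"genre": None}),
+    ]
+    tokenized = provider.tokenize(attributes)
+    output = provider(tokenized)  # {"genre": (embeds [B, 1, 16], mask [B, 1])}
+    assert output["genre"][0].shape == (2, 1, 16)
+    return output
+
+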
+class ConditionFuser(StreamingModule):
+ """Condition fuser handles the logic to combine the different conditions
+ to the actual model input.
+
+ Args:
+        fuse2cond (tp.Dict[str, tp.List[str]]): A dictionary that says how to fuse
+ each condition. For example:
+ {
+ "prepend": ["description"],
+ "sum": ["genre", "bpm"],
+ "cross": ["description"],
+ }
+ cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
+        cross_attention_pos_emb_scale (float): Scale for positional embeddings in cross attention if used.
+ """
+ FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
+
+ def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
+ cross_attention_pos_emb_scale: float = 1.0):
+ super().__init__()
+ assert all(
+ [k in self.FUSING_METHODS for k in fuse2cond.keys()]
+ ), f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
+ self.cross_attention_pos_emb = cross_attention_pos_emb
+ self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
+ self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
+ self.cond2fuse: tp.Dict[str, str] = {}
+ for fuse_method, conditions in fuse2cond.items():
+ for condition in conditions:
+ self.cond2fuse[condition] = fuse_method
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ conditions: tp.Dict[str, ConditionType]
+ ) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+ """Fuse the conditions to the provided model input.
+
+ Args:
+ input (torch.Tensor): Transformer input.
+ conditions (dict[str, ConditionType]): Dict of conditions.
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]: The first tensor is the transformer input
+ after the conditions have been fused. The second output tensor is the tensor
+ used for cross-attention or None if no cross attention inputs exist.
+ """
+ B, T, _ = input.shape
+
+ if 'offsets' in self._streaming_state:
+ first_step = False
+ offsets = self._streaming_state['offsets']
+ else:
+ first_step = True
+ offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
+
+ assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
+ f"given conditions contain unknown attributes for fuser, " \
+ f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
+ cross_attention_output = None
+ for cond_type, (cond, cond_mask) in conditions.items():
+ op = self.cond2fuse[cond_type]
+ if op == 'sum':
+ input += cond
+ elif op == 'input_interpolate':
+ cond = einops.rearrange(cond, "b t d -> b d t")
+ cond = F.interpolate(cond, size=input.shape[1])
+ input += einops.rearrange(cond, "b d t -> b t d")
+ elif op == 'prepend':
+ if first_step:
+ input = torch.cat([cond, input], dim=1)
+ elif op == 'cross':
+ if cross_attention_output is not None:
+ cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
+ else:
+ cross_attention_output = cond
+ else:
+ raise ValueError(f"unknown op ({op})")
+
+ if self.cross_attention_pos_emb and cross_attention_output is not None:
+ positions = torch.arange(
+ cross_attention_output.shape[1],
+ device=cross_attention_output.device
+ ).view(1, -1, 1)
+ pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
+ cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
+
+ if self._is_streaming:
+ self._streaming_state['offsets'] = offsets + T
+
+ return input, cross_attention_output
+
+
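+# Illustrative sketch (not part of the original module; shapes and attribute names are
+# arbitrary): fusing a prepended "description" condition grows the sequence dimension,
+# while a "cross" condition is routed to the cross-attention output instead of being
+# merged into the transformer input.
+def _example_condition_fuser() -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+    fuser = ConditionFuser({"prepend": ["description"], "cross": ["self_wav"]})
+    transformer_input = torch.randn(2, 10, 16)  # [B, T, D]
+    conditions = {
+        "description": (torch.randn(2, 3, 16), torch.ones(2, 3)),
+        "self_wav": (torch.randn(2, 5, 16), torch.ones(2, 5)),
+    }
+    fused, cross = fuser(transformer_input, conditions)
+    assert fused.shape == (2, 13, 16)
+    assert cross is not None and cross.shape == (2, 5, 16)
+    return fused, cross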