MTG · dbogdanov · Apr 9, 2025 · Jan 29, 2025 · Jan 29, 2025 · Jan 29, 2025
diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst
diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp
@@ -113,8 +113,8 @@ void TensorflowPredictMAEST::configure() {
 
   if (parameter("patchSize").isConfigured()) {
     if (graphFilename.find("discogs-maest-20s-") != std::string::npos) {
-      E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model.");
-      patchSize = 1258;
+      E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1256, which is adequate for the 20s model.");
+      patchSize = 1256;
     } else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) {
       E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model.");
       patchSize = 626;

diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.h b/src/algorithms/machinelearning/tensorflowpredictmaest.h
@@ -63,8 +63,8 @@ class TensorflowPredictMAEST : public AlgorithmComposite {
   void declareParameters() {
     declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
     declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
-    declareParameter("input", "the name of the input node in the TensorFlow graph", "", "serving_default_melspectrogram");
-    declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
+    declareParameter("input", "the name of the input node in the TensorFlow graph", "", "melspectrogram");
+    declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
     declareParameter("isTrainingName", "the name of an additional input node to indicate the model if it is in training mode or not. Leave it empty when the model does not need such input", "", "");
     declareParameter("patchHopSize", "the number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
     declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");
@@ -116,8 +116,8 @@ class TensorflowPredictMAEST : public Algorithm {
   void declareParameters() {
     declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
     declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
-    declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram");
-    declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
+    declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "melspectrogram");
+    declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
     declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", "");
     declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
     declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");

diff --git a/src/examples/python/models/generate_example_scripts.py b/src/examples/python/models/generate_example_scripts.py
@@ -12,9 +12,10 @@
     "TensorflowPredictMusiCNN": "model/Placeholder",
     "TensorflowPredictVGGish": "model/Placeholder",
     "TensorflowPredict2D": "model/Placeholder",
+    "TensorflowPredict": "model/Placeholder",
     "TensorflowPredictEffnetDiscogs": "serving_default_melspectrogram",
     "TensorflowPredictFSDSINet": "x",
-    "TensorflowPredictMAEST": "serving_default_melspectrogram",
+    "TensorflowPredictMAEST": "melspectrogram",
     "PitchCREPE": "frames",
     "TempoCNN": "input",
 }
@@ -23,9 +24,10 @@
     "TensorflowPredictMusiCNN": "model/Sigmoid",
     "TensorflowPredictVGGish": "model/Sigmoid",
     "TensorflowPredict2D": "model/Sigmoid",
+    "TensorflowPredict": "model/Sigmoid",
     "TensorflowPredictEffnetDiscogs": "PartitionedCall:0",
     "TensorflowPredictFSDSINet": "model/predictions/Sigmoid",
-    "TensorflowPredictMAEST": "PartitionedCall:0",
+    "TensorflowPredictMAEST": "Identity",
     "PitchCREPE": "model/classifier/Sigmoid",
     "TempoCNN": "output",
 }
@@ -66,46 +68,86 @@ def generate_single_step_algorithm(
 def generate_two_steps_algorithm(
     first_graph_filename: str,
     first_algo_name: str,
-    first_output_node: str,
+    first_algo_params: str,
     second_graph_filename: str,
     second_algo_name: str,
-    second_output_node: str,
+    second_algo_parms: str,
     sample_rate: int,
     algo_returns: str,
     audio_file: str,
+    output_name: str | None = None,
 ):
-    return (
-        f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
-        "\n"
-        f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
-        f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_output_node})\n'
-        f"embeddings = embedding_model(audio)\n"
-        "\n"
-        f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_output_node})\n'
-        f"{algo_returns} = model(embeddings)\n"
-    )
+    if second_algo_name == "TensorflowPredict2D":
+        return (
+            f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
+            "\n"
+            f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
+            f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
+            f"embeddings = embedding_model(audio)\n"
+            "\n"
+            f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
+            f"{algo_returns} = model(embeddings)\n"
+        )
+    elif second_algo_name == "TensorflowPredict":
+        assert output_name is not None, (
+            "output_name must be specified for TensorflowPredict"
+        )
+        return (
+            "from essentia import Pool\n"
+            f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
+            "\n"
+            f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
+            f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
+            f"embeddings = embedding_model(audio)\n"
+            "\n"
+            "pool = Pool()\n"
+            'pool.set("embeddings", embeddings)\n'
+            "\n"
+            f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
+            f'{algo_returns} = model(pool)["{output_name}"]\n'
+        )
+    else:
+        raise ValueError(f"Unknown second_algo_name: {second_algo_name}")
+
+
+def get_output_node_name(metadata: dict, output_purpose: str):
+    """Get the output node name for a given output purpose"""
+    outputs = metadata["schema"]["outputs"]
 
+    for output in outputs:
+        if "output_purpose" not in output:
+            continue
+        if output["output_purpose"] == output_purpose:
+            return output["name"]
 
-def get_additional_parameters(metadata: dict, output: str, algo_name: str):
-    additional_parameters = ""
+    raise ValueError(f"Output node not found for `output_purpose`: {output_purpose}")
 
+
+def get_kwargs_string(metadata: dict, output_purpose: str, algo_name: str):
+    """Get kwargs string for a given algorithm"""
+
+    kwargs_str = ""
+
+    algo_name = metadata["inference"]["algorithm"]
     input = metadata["schema"]["inputs"][0]["name"]
+
+    # Set input related params
     if input != INPUT_DEFAULTS[algo_name]:
-        additional_parameters = f', input="{input}"'
+        if algo_name == "TensorflowPredict":
+            kwargs_str += f', inputs=["{input}"]'
+        else:
+            kwargs_str += f', input="{input}"'
 
-    outputs = metadata["schema"]["outputs"]
-    for model_output in outputs:
-        if (
-            model_output["output_purpose"] == output
-            and model_output["name"] != OUTPUT_DEFAULTS[algo_name]
-        ):
-            if metadata["name"] == "MAEST" and ":7" not in model_output["name"]:
-                # For MAEST we recommend using the embeddings from the 7th layer.
-                continue
+    # Set output related params
+    output_node_name = get_output_node_name(metadata, output_purpose)
 
-            additional_parameters += f', output="{model_output["name"]}"'
+    if output_node_name != OUTPUT_DEFAULTS[algo_name]:
+        if algo_name == "TensorflowPredict":
+            kwargs_str += f', outputs=["{output_node_name}"]'
+        else:
+            kwargs_str += f', output="{output_node_name}"'
 
-    return additional_parameters
+    return kwargs_str
 
 
 def get_metadata(task_type: str, family_name: str, model: str, metadata_base_dir=False):
@@ -150,7 +192,7 @@ def process_model(
     algo_name = metadata["inference"]["algorithm"]
 
     # check if we need a custom output node
-    additional_parameters = get_additional_parameters(metadata, output, algo_name)
+    algo_kwargs = get_kwargs_string(metadata, output, algo_name)
 
     # set algos with custom output
     algo_returns = CUSTOM_ALGO_OUTPUTS.get(algo_name, output)
@@ -160,9 +202,9 @@ def process_model(
 
     graph_filename_tgt = script_dir / graph_filename
     if download_models and (not graph_filename_tgt.exists()):
-        assert (
-            not models_base_dir
-        ), "downloading the models is incompatible with specifying `models_base_dir`"
+        assert not models_base_dir, (
+            "downloading the models is incompatible with specifying `models_base_dir`"
+        )
         try:
             script_dir.mkdir(parents=True, exist_ok=True)
             urlretrieve(metadata["link"], graph_filename_tgt)
@@ -178,6 +220,7 @@ def process_model(
         metadata_link = metadata["inference"]["embedding_model"]["link"]
         embedding_task_type = Path(metadata_link).parent.parent.stem
         embedding_family_name = Path(metadata_link).parent.stem
+
         embedding_metadata = get_metadata(
             embedding_task_type,
             embedding_family_name,
@@ -202,27 +245,36 @@ def process_model(
                 print(f"Failed downloading {metadata['link']}")
                 exit(1)
 
-        embedding_additional_parameters = get_additional_parameters(
-            embedding_metadata, "embeddings", embedding_algo_name
+        embedding_algo_kwargs = get_kwargs_string(
+            embedding_metadata,
+            "embeddings",
+            embedding_algo_name,
         )
+        output_node_name = get_output_node_name(metadata, output)
+
+        # Exceptions:
+        # - MAEST-based genre discogs models use the 12th layer instead of the 7th
+        if "Genre Discogs" in metadata["name"]:
+            embedding_algo_kwargs = embedding_algo_kwargs.replace("7", "12")
 
         script = generate_two_steps_algorithm(
             embedding_graph_filename,
             embedding_algo_name,
-            embedding_additional_parameters,
+            embedding_algo_kwargs,
             graph_filename,
             algo_name,
-            additional_parameters,
+            algo_kwargs,
             sample_rate,
             algo_returns,
             audio_file,
+            output_name=output_node_name,
         )
     else:
         script = generate_single_step_algorithm(
             graph_filename,
             algo_name,
             sample_rate,
-            additional_parameters,
+            algo_kwargs,
             algo_returns,
             audio_file,
         )

diff --git a/src/examples/python/models/generate_example_scripts.sh b/src/examples/python/models/generate_example_scripts.sh
@@ -1,4 +1,4 @@
-#! /bin/bash
+#!/bin/bash
 
 set -e
 

diff --git a/src/examples/python/models/models.yaml b/src/examples/python/models/models.yaml
@@ -71,13 +71,14 @@ feature-extractors:
     outputs:
      - embeddings
     models:
-     - discogs-maest-10s-dw-1
-     - discogs-maest-10s-fs-1
-     - discogs-maest-10s-pw-1
-     - discogs-maest-20s-pw-1
-     - discogs-maest-30s-pw-1
-     - discogs-maest-30s-pw-ts-1
-     - discogs-maest-5s-pw-1
+     - discogs-maest-5s-pw-2
+     - discogs-maest-10s-dw-2
+     - discogs-maest-10s-fs-2
+     - discogs-maest-10s-pw-2
+     - discogs-maest-20s-pw-2
+     - discogs-maest-30s-pw-2
+     - discogs-maest-30s-pw-ts-2
+     - discogs-maest-30s-pw-519l-2
 
 pitch:
   crepe:
@@ -419,4 +420,19 @@ classification-heads:
     outputs:
      - predictions
     models:
-     - genre_discogs400-discogs-effnet-1
+      - genre_discogs400-discogs-effnet-1
+      - genre_discogs400-discogs-maest-5s-pw-1
+      - genre_discogs400-discogs-maest-10s-dw-1
+      - genre_discogs400-discogs-maest-10s-fs-1
+      - genre_discogs400-discogs-maest-10s-pw-1
+      - genre_discogs400-discogs-maest-20s-pw-1
+      - genre_discogs400-discogs-maest-30s-pw-1
+      - genre_discogs400-discogs-maest-30s-pw-ts-1
+
+  genre_discogs519:
+    sample_rate: 16000
+    outputs:
+      - predictions
+    models:
+      - genre_discogs519-discogs-maest-30s-pw-519l-1
+
diff --git a/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py b/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-dw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py b/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-fs-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py b/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py b/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-20s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py b/...ssification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...fication-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py b/...fication-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-ts-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...assification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py b/...assification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-5s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]
diff --git a/...cation-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py b/...cation-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py
@@ -0,0 +1,12 @@
+from essentia import Pool
+from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
+
+audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-519l-2.pb", output="PartitionedCall/Identity_12")
+embeddings = embedding_model(audio)
+
+pool = Pool()
+pool.set("embeddings", embeddings)
+
+model = TensorflowPredict(graphFilename="genre_discogs519-discogs-maest-30s-pw-519l-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
+predictions = model(pool)["PartitionedCall/Identity_1"]