Skip to content
Merged
164 changes: 145 additions & 19 deletions doc/sphinxdoc/models.rst

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/algorithms/machinelearning/tensorflowpredictmaest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ void TensorflowPredictMAEST::configure() {

if (parameter("patchSize").isConfigured()) {
if (graphFilename.find("discogs-maest-20s-") != std::string::npos) {
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model.");
patchSize = 1258;
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1256, which is adequate for the 20s model.");
patchSize = 1256;
} else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) {
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model.");
patchSize = 626;
Expand Down
8 changes: 4 additions & 4 deletions src/algorithms/machinelearning/tensorflowpredictmaest.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ class TensorflowPredictMAEST : public AlgorithmComposite {
void declareParameters() {
declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
declareParameter("input", "the name of the input node in the TensorFlow graph", "", "serving_default_melspectrogram");
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
declareParameter("input", "the name of the input node in the TensorFlow graph", "", "melspectrogram");
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
declareParameter("isTrainingName", "the name of an additional input node to indicate the model if it is in training mode or not. Leave it empty when the model does not need such input", "", "");
declareParameter("patchHopSize", "the number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");
Expand Down Expand Up @@ -116,8 +116,8 @@ class TensorflowPredictMAEST : public Algorithm {
void declareParameters() {
declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram");
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "melspectrogram");
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", "");
declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");
Expand Down
126 changes: 89 additions & 37 deletions src/examples/python/models/generate_example_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
"TensorflowPredictMusiCNN": "model/Placeholder",
"TensorflowPredictVGGish": "model/Placeholder",
"TensorflowPredict2D": "model/Placeholder",
"TensorflowPredict": "model/Placeholder",
"TensorflowPredictEffnetDiscogs": "serving_default_melspectrogram",
"TensorflowPredictFSDSINet": "x",
"TensorflowPredictMAEST": "serving_default_melspectrogram",
"TensorflowPredictMAEST": "melspectrogram",
"PitchCREPE": "frames",
"TempoCNN": "input",
}
Expand All @@ -23,9 +24,10 @@
"TensorflowPredictMusiCNN": "model/Sigmoid",
"TensorflowPredictVGGish": "model/Sigmoid",
"TensorflowPredict2D": "model/Sigmoid",
"TensorflowPredict": "model/Sigmoid",
"TensorflowPredictEffnetDiscogs": "PartitionedCall:0",
"TensorflowPredictFSDSINet": "model/predictions/Sigmoid",
"TensorflowPredictMAEST": "PartitionedCall:0",
"TensorflowPredictMAEST": "Identity",
"PitchCREPE": "model/classifier/Sigmoid",
"TempoCNN": "output",
}
Expand Down Expand Up @@ -66,46 +68,86 @@ def generate_single_step_algorithm(
def generate_two_steps_algorithm(
first_graph_filename: str,
first_algo_name: str,
first_output_node: str,
first_algo_params: str,
second_graph_filename: str,
second_algo_name: str,
second_output_node: str,
second_algo_parms: str,
sample_rate: int,
algo_returns: str,
audio_file: str,
output_name: str | None = None,
):
return (
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
"\n"
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_output_node})\n'
f"embeddings = embedding_model(audio)\n"
"\n"
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_output_node})\n'
f"{algo_returns} = model(embeddings)\n"
)
if second_algo_name == "TensorflowPredict2D":
return (
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
"\n"
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
f"embeddings = embedding_model(audio)\n"
"\n"
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
f"{algo_returns} = model(embeddings)\n"
)
elif second_algo_name == "TensorflowPredict":
assert output_name is not None, (
"output_name must be specified for TensorflowPredict"
)
return (
"from essentia import Pool\n"
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
"\n"
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
f"embeddings = embedding_model(audio)\n"
"\n"
"pool = Pool()\n"
'pool.set("embeddings", embeddings)\n'
"\n"
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
f'{algo_returns} = model(pool)["{output_name}"]\n'
)
else:
raise ValueError(f"Unknown second_algo_name: {second_algo_name}")


def get_output_node_name(metadata: dict, output_purpose: str):
"""Get the output node name for a given output purpose"""
outputs = metadata["schema"]["outputs"]

for output in outputs:
if "output_purpose" not in output:
continue
if output["output_purpose"] == output_purpose:
return output["name"]

def get_additional_parameters(metadata: dict, output: str, algo_name: str):
additional_parameters = ""
raise ValueError(f"Output node not found for `output_purpose`: {output_purpose}")


def get_kwargs_string(metadata: dict, output_purpose: str, algo_name: str):
"""Get kwargs string for a given algorithm"""

kwargs_str = ""

algo_name = metadata["inference"]["algorithm"]
input = metadata["schema"]["inputs"][0]["name"]

# Set input related params
if input != INPUT_DEFAULTS[algo_name]:
additional_parameters = f', input="{input}"'
if algo_name == "TensorflowPredict":
kwargs_str += f', inputs=["{input}"]'
else:
kwargs_str += f', input="{input}"'

outputs = metadata["schema"]["outputs"]
for model_output in outputs:
if (
model_output["output_purpose"] == output
and model_output["name"] != OUTPUT_DEFAULTS[algo_name]
):
if metadata["name"] == "MAEST" and ":7" not in model_output["name"]:
# For MAEST we recommend using the embeddings from the 7th layer.
continue
# Set output related params
output_node_name = get_output_node_name(metadata, output_purpose)

additional_parameters += f', output="{model_output["name"]}"'
if output_node_name != OUTPUT_DEFAULTS[algo_name]:
if algo_name == "TensorflowPredict":
kwargs_str += f', outputs=["{output_node_name}"]'
else:
kwargs_str += f', output="{output_node_name}"'

return additional_parameters
return kwargs_str


def get_metadata(task_type: str, family_name: str, model: str, metadata_base_dir=False):
Expand Down Expand Up @@ -150,7 +192,7 @@ def process_model(
algo_name = metadata["inference"]["algorithm"]

# check if we need a custom output node
additional_parameters = get_additional_parameters(metadata, output, algo_name)
algo_kwargs = get_kwargs_string(metadata, output, algo_name)

# set algos with custom output
algo_returns = CUSTOM_ALGO_OUTPUTS.get(algo_name, output)
Expand All @@ -160,9 +202,9 @@ def process_model(

graph_filename_tgt = script_dir / graph_filename
if download_models and (not graph_filename_tgt.exists()):
assert (
not models_base_dir
), "downloading the models is incompatible with specifying `models_base_dir`"
assert not models_base_dir, (
"downloading the models is incompatible with specifying `models_base_dir`"
)
try:
script_dir.mkdir(parents=True, exist_ok=True)
urlretrieve(metadata["link"], graph_filename_tgt)
Expand All @@ -178,6 +220,7 @@ def process_model(
metadata_link = metadata["inference"]["embedding_model"]["link"]
embedding_task_type = Path(metadata_link).parent.parent.stem
embedding_family_name = Path(metadata_link).parent.stem

embedding_metadata = get_metadata(
embedding_task_type,
embedding_family_name,
Expand All @@ -202,27 +245,36 @@ def process_model(
print(f"Failed downloading {metadata['link']}")
exit(1)

embedding_additional_parameters = get_additional_parameters(
embedding_metadata, "embeddings", embedding_algo_name
embedding_algo_kwargs = get_kwargs_string(
embedding_metadata,
"embeddings",
embedding_algo_name,
)
output_node_name = get_output_node_name(metadata, output)

# Exceptions:
# - MAEST-based genre discogs models use the 12th layer instead of the 7th
if "Genre Discogs" in metadata["name"]:
embedding_algo_kwargs = embedding_algo_kwargs.replace("7", "12")

script = generate_two_steps_algorithm(
embedding_graph_filename,
embedding_algo_name,
embedding_additional_parameters,
embedding_algo_kwargs,
graph_filename,
algo_name,
additional_parameters,
algo_kwargs,
sample_rate,
algo_returns,
audio_file,
output_name=output_node_name,
)
else:
script = generate_single_step_algorithm(
graph_filename,
algo_name,
sample_rate,
additional_parameters,
algo_kwargs,
algo_returns,
audio_file,
)
Expand Down
2 changes: 1 addition & 1 deletion src/examples/python/models/generate_example_scripts.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /bin/bash
#!/bin/bash

set -e

Expand Down
32 changes: 24 additions & 8 deletions src/examples/python/models/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,14 @@ feature-extractors:
outputs:
- embeddings
models:
- discogs-maest-10s-dw-1
- discogs-maest-10s-fs-1
- discogs-maest-10s-pw-1
- discogs-maest-20s-pw-1
- discogs-maest-30s-pw-1
- discogs-maest-30s-pw-ts-1
- discogs-maest-5s-pw-1
- discogs-maest-5s-pw-2
- discogs-maest-10s-dw-2
- discogs-maest-10s-fs-2
- discogs-maest-10s-pw-2
- discogs-maest-20s-pw-2
- discogs-maest-30s-pw-2
- discogs-maest-30s-pw-ts-2
- discogs-maest-30s-pw-519l-2

pitch:
crepe:
Expand Down Expand Up @@ -419,4 +420,19 @@ classification-heads:
outputs:
- predictions
models:
- genre_discogs400-discogs-effnet-1
- genre_discogs400-discogs-effnet-1
- genre_discogs400-discogs-maest-5s-pw-1
- genre_discogs400-discogs-maest-10s-dw-1
- genre_discogs400-discogs-maest-10s-fs-1
- genre_discogs400-discogs-maest-10s-pw-1
- genre_discogs400-discogs-maest-20s-pw-1
- genre_discogs400-discogs-maest-30s-pw-1
- genre_discogs400-discogs-maest-30s-pw-ts-1

genre_discogs519:
sample_rate: 16000
outputs:
- predictions
models:
- genre_discogs519-discogs-maest-30s-pw-519l-1

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-dw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-fs-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-20s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-ts-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-5s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict

audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-519l-2.pb", output="PartitionedCall/Identity_12")
embeddings = embedding_model(audio)

pool = Pool()
pool.set("embeddings", embeddings)

model = TensorflowPredict(graphFilename="genre_discogs519-discogs-maest-30s-pw-519l-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
predictions = model(pool)["PartitionedCall/Identity_1"]
Loading
Loading