diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index 2ac080db2..a77be66ba 100644 --- a/doc/sphinxdoc/models.rst +++ b/doc/sphinxdoc/models.rst @@ -144,97 +144,116 @@ Music Audio Efficient Spectrogram Transformer (`MAEST `_ pre-trained weights (``dw``), and from `PaSST `_ pre-trained weights (``pw``). Additionally, we offer a version of MAEST trained following a teacher student setup (``ts``). According to our study ``discogs-maest-30s-pw``, achieved the most competitive performance in most downstream tasks (refer to the `paper `_ for details). +The output embeddings have shape ``[batch_size, 1, tokens, embedding_size]``, where the first and second tokens (i.e., ``[0, 0, :2, :]`` ) correspond to the ``CLS`` and ``DIST`` tokens respectively, and the following ones to input signal. +To train downstream models, we recommend using the embeddings from the ``CLS`` token, or stacking the ``CLS``, ``DIST``, and the average of the input signal tokens for slightly better performance (refer to the `paper `_ for details). + +In the following examples, we extract embeddings from the 7th layer of the transformer since this is what performed the best in our downstream classification tasks. +To extract embeddings from other layers, change the ``output`` parameter according to the layer names provided in the metadata files. + Models: + .. collapse:: ⬇️ discogs-maest-30s-pw-519l + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 519 Discogs styles on an extended dataset of 4M tracks. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-519l-2_embeddings.py + .. collapse:: ⬇️ discogs-maest-30s-pw | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. 
literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-2_embeddings.py .. collapse:: ⬇️ discogs-maest-30s-pw-ts | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-2_embeddings.py .. collapse:: ⬇️ discogs-maest-20s-pw | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-2_embeddings.py .. collapse:: ⬇️ discogs-maest-10s-pw | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-2_embeddings.py .. collapse:: ⬇️ discogs-maest-10s-fs | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. 
literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-2_embeddings.py .. collapse:: ⬇️ discogs-maest-10s-dw | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-2_embeddings.py .. collapse:: ⬇️ discogs-maest-5s-pw | - [`weights `_, `metadata `_] + [`weights `_, `metadata `_] Model trained with a multi-label classification objective targeting 400 Discogs styles. Python code for embedding extraction: - .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-2_embeddings.py -*Note: It is possible to retrieve the output of each attention layer by setting* ``output=StatefulParitionedCall:n`` *, where* ``n`` *is the index of the layer (starting from 1).* -*The output from the attention layers should be interpreted as* ``[batch_index, 1, token_number, embeddings_size]`` -*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal.* +*Note:* ``discogs-maest-30s-pw-519l`` *is an updated version of MAEST trained on a larger dataset of 4M tracks and 519 music style labels. It is expected to show slightly better performance.* + +*Note: We provide TensorFlow models operating with a fixed batch size of 1. 
Additionally, ONNX versions of the models supporting dynamic batch sizes are provided.* + OpenL3 ^^^^^^ @@ -370,7 +389,7 @@ Music style classification by 400 styles from the Discogs taxonomy:: Models: - .. collapse:: ⬇️ genre_discogs400 + .. collapse:: ⬇️ genre_discogs400-discogs-effnet | @@ -380,6 +399,114 @@ Models: .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-effnet-1_predictions.py + .. collapse:: ⬇️ genre_discogs400-discogs-maest-5s-pw + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py + + .. collapse:: ⬇️ genre_discogs400-discogs-maest-10s-pw + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py + + .. collapse:: ⬇️ genre_discogs400-discogs-maest-10s-fs + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py + + .. collapse:: ⬇️ genre_discogs400-discogs-maest-10s-dw + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py + + .. collapse:: ⬇️ genre_discogs400-discogs-maest-20s-pw + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py + + .. 
collapse:: ⬇️ genre_discogs400-discogs-maest-30s-pw + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py + + .. collapse:: ⬇️ genre_discogs400-discogs-maest-30s-pw-ts + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py + + +Genre Discogs519 +~~~~~~~~~~~~~~~~ + +.. highlight:: none + +Music style classification by 519 styles from the Discogs taxonomy:: + + Blues: Boogie Woogie, Chicago Blues, Country Blues, Delta Blues, East Coast Blues, Electric Blues, Harmonica Blues, Jump Blues, Louisiana Blues, Memphis Blues, Modern Electric Blues, Piano Blues, Piedmont Blues, Rhythm & Blues, Texas Blues + Brass & Military: Brass Band, Marches, Military, Pipe & Drum + Children's: Educational, Nursery Rhymes, Story + Classical: Baroque, Choral, Classical, Contemporary, Early, Impressionist, Medieval, Modern, Neo-Classical, Neo-Romantic, Opera, Operetta, Oratorio, Post-Modern, Renaissance, Romantic, Twelve-tone + Electronic: Abstract, Acid, Acid House, Acid Jazz, Ambient, Baltimore Club, Bassline, Beatdown, Berlin-School, Big Beat, Bleep, Breakbeat, Breakcore, Breaks, Broken Beat, Chillwave, Chiptune, Dance-pop, Dark Ambient, Darkwave, Deep House, Deep Techno, Disco, Disco Polo, Donk, Doomcore, Downtempo, Drone, Drum n Bass, Dub, Dub Techno, Dubstep, Dungeon Synth, EBM, Electro, Electro House, Electroacoustic, Electroclash, Euro House, Euro-Disco, Eurobeat, Eurodance, Experimental, Footwork, Freestyle, Future Jazz, Gabber, Garage House, Ghetto, Ghetto House, Ghettotech, Glitch, Glitch Hop, Goa Trance, Grime, Halftime, Hands Up, Happy Hardcore, Hard Beat, Hard House, Hard Techno, Hard Trance, Hardcore, 
Hardstyle, Harsh Noise Wall, Hi NRG, Hip Hop, Hip-House, House, IDM, Illbient, Industrial, Italo House, Italo-Disco, Italodance, J-Core, Jazzdance, Juke, Jumpstyle, Jungle, Latin, Leftfield, Lento Violento, Makina, Minimal, Minimal Techno, Modern Classical, Musique Concrète, Neo Trance, Neofolk, New Age, New Beat, New Wave, Noise, Nu-Disco, Power Electronics, Progressive Breaks, Progressive House, Progressive Trance, Psy-Trance, Rhythmic Noise, Schranz, Sound Collage, Speed Garage, Speedcore, Synth-pop, Synthwave, Tech House, Tech Trance, Techno, Trance, Tribal, Tribal House, Trip Hop, Tropical House, UK Funky, UK Garage, Vaporwave, Witch House + Folk, World, & Country: Aboriginal, African, Andalusian Classical, Andean Music, Appalachian Music, Basque Music, Bhangra, Bluegrass, Cajun, Canzone Napoletana, Carnatic, Catalan Music, Celtic, Chacarera, Chinese Classical, Chutney, Copla, Country, Cretan, Dangdut, Fado, Flamenco, Folk, Funaná, Gamelan, Ghazal, Gospel, Griot, Hawaiian, Highlife, Hillbilly, Hindustani, Honky Tonk, Indian Classical, Kaseko, Klezmer, Laïkó, Luk Thung, Maloya, Mbalax, Min'yō, Mizrahi, Nhạc Vàng, Nordic, Népzene, Ottoman Classical, Overtone Singing, Pacific, Pasodoble, Persian Classical, Phleng Phuea Chiwit, Polka, Qawwali, Raï, Rebetiko, Romani, Salegy, Sea Shanties, Soukous, Séga, Volksmusik, Western Swing, Zouk, Zydeco, Éntekhno + Funk / Soul: Afrobeat, Bayou Funk, Boogie, Contemporary R&B, Disco, Free Funk, Funk, Gogo, Gospel, Minneapolis Sound, Neo Soul, New Jack Swing, P.Funk, Psychedelic, Rhythm & Blues, Soul, Swingbeat, UK Street Soul + Hip Hop: Bass Music, Beatbox, Boom Bap, Bounce, Britcore, Cloud Rap, Conscious, Crunk, Cut-up/DJ, DJ Battle Tool, Electro, Favela Funk, G-Funk, Gangsta, Go-Go, Grime, Hardcore Hip-Hop, Hiplife, Horrorcore, Hyphy, Instrumental, Jazzy Hip-Hop, Kwaito, Miami Bass, Pop Rap, Ragga HipHop, RnB/Swing, Screw, Thug Rap, Trap, Trip Hop, Turntablism + Jazz: Afro-Cuban Jazz, Afrobeat, Avant-garde Jazz, Big Band, 
Bop, Bossa Nova, Cape Jazz, Contemporary Jazz, Cool Jazz, Dixieland, Easy Listening, Free Improvisation, Free Jazz, Fusion, Gypsy Jazz, Hard Bop, Jazz-Funk, Jazz-Rock, Latin Jazz, Modal, Post Bop, Ragtime, Smooth Jazz, Soul-Jazz, Space-Age, Swing + Latin: Afro-Cuban, Axé, Bachata, Baião, Batucada, Beguine, Bolero, Boogaloo, Bossanova, Carimbó, Cha-Cha, Charanga, Choro, Compas, Conjunto, Corrido, Cubano, Cumbia, Danzon, Descarga, Forró, Gaita, Guaguancó, Guajira, Guaracha, Jibaro, Lambada, MPB, Mambo, Mariachi, Marimba, Merengue, Música Criolla, Norteño, Nueva Cancion, Nueva Trova, Pachanga, Plena, Porro, Quechua, Ranchera, Reggaeton, Rumba, Salsa, Samba, Samba-Canção, Son, Son Montuno, Sonero, Tango, Tejano, Timba, Trova, Vallenato + Non-Music: Audiobook, Comedy, Dialogue, Education, Erotic, Field Recording, Health-Fitness, Interview, Monolog, Movie Effects, Poetry, Political, Promotional, Public Broadcast, Radioplay, Religious, Sermon, Sound Art, Sound Poetry, Special Effects, Speech, Spoken Word, Technical, Therapy + Pop: Ballad, Barbershop, Bollywood, Break-In, Bubblegum, Chanson, City Pop, Enka, Ethno-pop, Europop, Indie Pop, J-pop, K-pop, Karaoke, Kayōkyoku, Levenslied, Light Music, Music Hall, Novelty, Parody, Schlager, Vocal + Reggae: Calypso, Dancehall, Dub, Dub Poetry, Lovers Rock, Mento, Ragga, Reggae, Reggae Gospel, Reggae-Pop, Rocksteady, Roots Reggae, Ska, Soca, Steel Band + Rock: AOR, Acid Rock, Acoustic, Alternative Rock, Arena Rock, Art Rock, Atmospheric Black Metal, Avantgarde, Beat, Black Metal, Blues Rock, Brit Pop, Classic Rock, Coldwave, Country Rock, Crust, Death Metal, Deathcore, Deathrock, Depressive Black Metal, Doo Wop, Doom Metal, Dream Pop, Emo, Ethereal, Experimental, Folk Metal, Folk Rock, Funeral Doom Metal, Funk Metal, Garage Rock, Glam, Goregrind, Goth Rock, Gothic Metal, Grindcore, Groove Metal, Grunge, Hard Rock, Hardcore, Heavy Metal, Horror Rock, Indie Rock, Industrial, Industrial Metal, J-Rock, Jangle Pop, K-Rock, Krautrock, 
Lo-Fi, Lounge, Math Rock, Melodic Death Metal, Melodic Hardcore, Metalcore, Mod, NDW, Neofolk, New Wave, No Wave, Noise, Noisecore, Nu Metal, Oi, Parody, Pop Punk, Pop Rock, Pornogrind, Post Rock, Post-Hardcore, Post-Metal, Post-Punk, Power Metal, Power Pop, Power Violence, Prog Rock, Progressive Metal, Psychedelic Rock, Psychobilly, Pub Rock, Punk, Rock & Roll, Rock Opera, Rockabilly, Shoegaze, Ska, Skiffle, Sludge Metal, Soft Rock, Southern Rock, Space Rock, Speed Metal, Stoner Rock, Surf, Swamp Pop, Symphonic Rock, Technical Death Metal, Thrash, Twist, Viking Metal, Yé-Yé + Stage & Screen: Musical, Score, Soundtrack, Theme + + +.. highlight:: default + +Models: + + .. collapse:: ⬇️ genre_discogs519 + + | + + [`weights `_, `metadata `_, `demo `_] + + python code for predictions: + + .. literalinclude :: ../../src/examples/python/models/scripts/classification-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py MTG-Jamendo genre @@ -415,7 +542,6 @@ Models: | - [`weights `_, `metadata `_] Python code for predictions: diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp index 29d6a6a78..a3f523974 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -113,8 +113,8 @@ void TensorflowPredictMAEST::configure() { if (parameter("patchSize").isConfigured()) { if (graphFilename.find("discogs-maest-20s-") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model."); - patchSize = 1258; + E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. 
Setting it to 1256, which is adequate for the 20s model."); + patchSize = 1256; } else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model."); patchSize = 626; diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.h b/src/algorithms/machinelearning/tensorflowpredictmaest.h index c366b8650..dcbbf2ff3 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.h +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.h @@ -63,8 +63,8 @@ class TensorflowPredictMAEST : public AlgorithmComposite { void declareParameters() { declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", ""); declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", ""); - declareParameter("input", "the name of the input node in the TensorFlow graph", "", "serving_default_melspectrogram"); - declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall"); + declareParameter("input", "the name of the input node in the TensorFlow graph", "", "melspectrogram"); + declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity"); declareParameter("isTrainingName", "the name of an additional input node to indicate the model if it is in training mode or not. Leave it empty when the model does not need such input", "", ""); declareParameter("patchHopSize", "the number of frames between the beginnings of adjacent patches. 
0 to avoid overlap", "[0,inf)", 1875); declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard"); @@ -116,8 +116,8 @@ class TensorflowPredictMAEST : public Algorithm { void declareParameters() { declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", ""); declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", ""); - declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram"); - declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall"); + declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "melspectrogram"); + declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity"); declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", ""); declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 
0 to avoid overlap", "[0,inf)", 1875); declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard"); diff --git a/src/examples/python/models/generate_example_scripts.py b/src/examples/python/models/generate_example_scripts.py index 358ae59c3..0dc3c5aed 100644 --- a/src/examples/python/models/generate_example_scripts.py +++ b/src/examples/python/models/generate_example_scripts.py @@ -12,9 +12,10 @@ "TensorflowPredictMusiCNN": "model/Placeholder", "TensorflowPredictVGGish": "model/Placeholder", "TensorflowPredict2D": "model/Placeholder", + "TensorflowPredict": "model/Placeholder", "TensorflowPredictEffnetDiscogs": "serving_default_melspectrogram", "TensorflowPredictFSDSINet": "x", - "TensorflowPredictMAEST": "serving_default_melspectrogram", + "TensorflowPredictMAEST": "melspectrogram", "PitchCREPE": "frames", "TempoCNN": "input", } @@ -23,9 +24,10 @@ "TensorflowPredictMusiCNN": "model/Sigmoid", "TensorflowPredictVGGish": "model/Sigmoid", "TensorflowPredict2D": "model/Sigmoid", + "TensorflowPredict": "model/Sigmoid", "TensorflowPredictEffnetDiscogs": "PartitionedCall:0", "TensorflowPredictFSDSINet": "model/predictions/Sigmoid", - "TensorflowPredictMAEST": "PartitionedCall:0", + "TensorflowPredictMAEST": "Identity", "PitchCREPE": "model/classifier/Sigmoid", "TempoCNN": "output", } @@ -66,46 +68,86 @@ def generate_single_step_algorithm( def generate_two_steps_algorithm( first_graph_filename: str, first_algo_name: str, - first_output_node: str, + first_algo_params: str, second_graph_filename: str, second_algo_name: str, - second_output_node: str, + second_algo_parms: str, sample_rate: int, algo_returns: str, audio_file: str, + output_name: str | None = None, ): - return ( - f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n" - "\n" - f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n' - 
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_output_node})\n' - f"embeddings = embedding_model(audio)\n" - "\n" - f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_output_node})\n' - f"{algo_returns} = model(embeddings)\n" - ) + if second_algo_name == "TensorflowPredict2D": + return ( + f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n" + "\n" + f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n' + f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n' + f"embeddings = embedding_model(audio)\n" + "\n" + f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n' + f"{algo_returns} = model(embeddings)\n" + ) + elif second_algo_name == "TensorflowPredict": + assert output_name is not None, ( + "output_name must be specified for TensorflowPredict" + ) + return ( + "from essentia import Pool\n" + f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n" + "\n" + f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n' + f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n' + f"embeddings = embedding_model(audio)\n" + "\n" + "pool = Pool()\n" + 'pool.set("embeddings", embeddings)\n' + "\n" + f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n' + f'{algo_returns} = model(pool)["{output_name}"]\n' + ) + else: + raise ValueError(f"Unknown second_algo_name: {second_algo_name}") + + +def get_output_node_name(metadata: dict, output_purpose: str): + """Get the output node name for a given output purpose""" + outputs = metadata["schema"]["outputs"] + for output in outputs: + if "output_purpose" not in output: + continue + if output["output_purpose"] == output_purpose: + return output["name"] -def 
get_additional_parameters(metadata: dict, output: str, algo_name: str): - additional_parameters = "" + raise ValueError(f"Output node not found for `output_purpose`: {output_purpose}") + +def get_kwargs_string(metadata: dict, output_purpose: str, algo_name: str): + """Get kwargs string for a given algorithm""" + + kwargs_str = "" + + algo_name = metadata["inference"]["algorithm"] input = metadata["schema"]["inputs"][0]["name"] + + # Set input related params if input != INPUT_DEFAULTS[algo_name]: - additional_parameters = f', input="{input}"' + if algo_name == "TensorflowPredict": + kwargs_str += f', inputs=["{input}"]' + else: + kwargs_str += f', input="{input}"' - outputs = metadata["schema"]["outputs"] - for model_output in outputs: - if ( - model_output["output_purpose"] == output - and model_output["name"] != OUTPUT_DEFAULTS[algo_name] - ): - if metadata["name"] == "MAEST" and ":7" not in model_output["name"]: - # For MAEST we recommend using the embeddings from the 7th layer. - continue + # Set output related params + output_node_name = get_output_node_name(metadata, output_purpose) - additional_parameters += f', output="{model_output["name"]}"' + if output_node_name != OUTPUT_DEFAULTS[algo_name]: + if algo_name == "TensorflowPredict": + kwargs_str += f', outputs=["{output_node_name}"]' + else: + kwargs_str += f', output="{output_node_name}"' - return additional_parameters + return kwargs_str def get_metadata(task_type: str, family_name: str, model: str, metadata_base_dir=False): @@ -150,7 +192,7 @@ def process_model( algo_name = metadata["inference"]["algorithm"] # check if we need a custom output node - additional_parameters = get_additional_parameters(metadata, output, algo_name) + algo_kwargs = get_kwargs_string(metadata, output, algo_name) # set algos with custom output algo_returns = CUSTOM_ALGO_OUTPUTS.get(algo_name, output) @@ -160,9 +202,9 @@ def process_model( graph_filename_tgt = script_dir / graph_filename if download_models and (not 
graph_filename_tgt.exists()): - assert ( - not models_base_dir - ), "downloading the models is incompatible with specifying `models_base_dir`" + assert not models_base_dir, ( + "downloading the models is incompatible with specifying `models_base_dir`" + ) try: script_dir.mkdir(parents=True, exist_ok=True) urlretrieve(metadata["link"], graph_filename_tgt) @@ -178,6 +220,7 @@ def process_model( metadata_link = metadata["inference"]["embedding_model"]["link"] embedding_task_type = Path(metadata_link).parent.parent.stem embedding_family_name = Path(metadata_link).parent.stem + embedding_metadata = get_metadata( embedding_task_type, embedding_family_name, @@ -202,27 +245,36 @@ def process_model( print(f"Failed downloading {metadata['link']}") exit(1) - embedding_additional_parameters = get_additional_parameters( - embedding_metadata, "embeddings", embedding_algo_name + embedding_algo_kwargs = get_kwargs_string( + embedding_metadata, + "embeddings", + embedding_algo_name, ) + output_node_name = get_output_node_name(metadata, output) + + # Exceptions: + # - MAEST-based genre discogs models use the 12th layer instead of the 7th + if "Genre Discogs" in metadata["name"]: + embedding_algo_kwargs = embedding_algo_kwargs.replace("7", "12") script = generate_two_steps_algorithm( embedding_graph_filename, embedding_algo_name, - embedding_additional_parameters, + embedding_algo_kwargs, graph_filename, algo_name, - additional_parameters, + algo_kwargs, sample_rate, algo_returns, audio_file, + output_name=output_node_name, ) else: script = generate_single_step_algorithm( graph_filename, algo_name, sample_rate, - additional_parameters, + algo_kwargs, algo_returns, audio_file, ) diff --git a/src/examples/python/models/generate_example_scripts.sh b/src/examples/python/models/generate_example_scripts.sh index b53575952..ebbd8b4df 100755 --- a/src/examples/python/models/generate_example_scripts.sh +++ b/src/examples/python/models/generate_example_scripts.sh @@ -1,4 +1,4 @@ -#! 
/bin/bash +#!/bin/bash set -e diff --git a/src/examples/python/models/models.yaml b/src/examples/python/models/models.yaml index ae866ea13..eff198104 100644 --- a/src/examples/python/models/models.yaml +++ b/src/examples/python/models/models.yaml @@ -71,13 +71,14 @@ feature-extractors: outputs: - embeddings models: - - discogs-maest-10s-dw-1 - - discogs-maest-10s-fs-1 - - discogs-maest-10s-pw-1 - - discogs-maest-20s-pw-1 - - discogs-maest-30s-pw-1 - - discogs-maest-30s-pw-ts-1 - - discogs-maest-5s-pw-1 + - discogs-maest-5s-pw-2 + - discogs-maest-10s-dw-2 + - discogs-maest-10s-fs-2 + - discogs-maest-10s-pw-2 + - discogs-maest-20s-pw-2 + - discogs-maest-30s-pw-2 + - discogs-maest-30s-pw-ts-2 + - discogs-maest-30s-pw-519l-2 pitch: crepe: @@ -419,4 +420,19 @@ classification-heads: outputs: - predictions models: - - genre_discogs400-discogs-effnet-1 + - genre_discogs400-discogs-effnet-1 + - genre_discogs400-discogs-maest-5s-pw-1 + - genre_discogs400-discogs-maest-10s-dw-1 + - genre_discogs400-discogs-maest-10s-fs-1 + - genre_discogs400-discogs-maest-10s-pw-1 + - genre_discogs400-discogs-maest-20s-pw-1 + - genre_discogs400-discogs-maest-30s-pw-1 + - genre_discogs400-discogs-maest-30s-pw-ts-1 + + genre_discogs519: + sample_rate: 16000 + outputs: + - predictions + models: + - genre_discogs519-discogs-maest-30s-pw-519l-1 + diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py new file mode 100644 index 000000000..7cb6aef09 --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-dw-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, 
resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-dw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py new file mode 100644 index 000000000..708c3169c --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-fs-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-fs-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py new file mode 100644 index 000000000..de03f3445 --- /dev/null +++ 
b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-10s-pw-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py new file mode 100644 index 000000000..0e9d97c6e --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-20s-pw-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-20s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git 
a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py new file mode 100644 index 000000000..a4fff757e --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py new file mode 100644 index 000000000..929adbe4b --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-30s-pw-ts-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + 
+model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-ts-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py new file mode 100644 index 000000000..bbd8de7b2 --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs400/genre_discogs400-discogs-maest-5s-pw-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-5s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/classification-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py b/src/examples/python/models/scripts/classification-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py new file mode 100644 index 000000000..8070d9d13 --- /dev/null +++ b/src/examples/python/models/scripts/classification-heads/genre_discogs519/genre_discogs519-discogs-maest-30s-pw-519l-1_predictions.py @@ -0,0 +1,12 @@ +from essentia import Pool +from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() 
+embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-519l-2.pb", output="PartitionedCall/Identity_12") +embeddings = embedding_model(audio) + +pool = Pool() +pool.set("embeddings", embeddings) + +model = TensorflowPredict(graphFilename="genre_discogs519-discogs-maest-30s-pw-519l-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"]) +predictions = model(pool)["PartitionedCall/Identity_1"] diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-2_embeddings.py similarity index 83% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-2_embeddings.py index 24c2c636c..708dc7dcb 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-2_embeddings.py similarity index 83% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-2_embeddings.py index 
59c1c891d..0452beb54 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-2_embeddings.py similarity index 83% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-2_embeddings.py index aabe99d14..714c2d3e6 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-2_embeddings.py similarity index 83% rename from 
src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-2_embeddings.py index 3cd8d93b0..71fb7d5fb 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-2_embeddings.py similarity index 83% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-2_embeddings.py index 8e26a43ef..7a3496639 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git 
a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-519l-2_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-519l-2_embeddings.py new file mode 100644 index 000000000..0ef802278 --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-519l-2_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-519l-2.pb", output="PartitionedCall/Identity_7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-2_embeddings.py similarity index 82% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-2_embeddings.py index 2dfbaa9f6..abd12251a 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py 
b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-2_embeddings.py similarity index 83% rename from src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py rename to src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-2_embeddings.py index 3c747bd63..bab3f6d8e 100644 --- a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-2_embeddings.py @@ -1,5 +1,5 @@ from essentia.standard import MonoLoader, TensorflowPredictMAEST audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() -model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-1.pb", output="StatefulPartitionedCall:7") +model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-2.pb", output="PartitionedCall/Identity_7") embeddings = model(audio) diff --git a/src/examples/python/models/test_scripts.py b/src/examples/python/models/test_scripts.py new file mode 100644 index 000000000..08705ad0a --- /dev/null +++ b/src/examples/python/models/test_scripts.py @@ -0,0 +1,18 @@ +from glob import glob +from subprocess import run + + +def test_scripts(): + for script in glob("scripts/*/*/*.py"): + print(f"Testing {script}") + + process = run(["python3", script], capture_output=True) + if process.returncode != 0: + traceback = process.stderr.decode() + print(f"Error trace: {traceback}\n") + else: + print("Success\n") + + +if __name__ == "__main__": + test_scripts() diff --git a/src/examples/python/models/test_scripts.sh b/src/examples/python/models/test_scripts.sh new file mode 100755 index 000000000..1abd9a819 --- /dev/null +++ b/src/examples/python/models/test_scripts.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +# Generate example scripts pointing to the Essentia models local directory and an example audio +# file and execute them. 
+ +essentia_models_dir=/path/to/essentia-models/ +audio_file=/path/to/example_track.mp3 + +python3 generate_example_scripts.py \ + --force \ + --metadata-base-dir ${essentia_models_dir} \ + --models-base-dir ${essentia_models_dir} \ + --audio-file ${audio_file} + +TF_CPP_MIN_LOG_LEVEL=3 python3 test_scripts.py