diff --git a/examples/common/network.c b/examples/common/network.c
index e4db206..4d19939 100644
--- a/examples/common/network.c
+++ b/examples/common/network.c
@@ -145,9 +145,10 @@ int network_init(const char *ssid, const char *password, network_connect_cb cb)
                                                         NULL,
                                                         &instance_got_ip));
     wifi_config.sta.threshold.authmode = WIFI_AUTH_WPA2_PSK;
-    if (load_from_nvs()) {
+    wifi_config.sta.sae_pwe_h2e = WPA3_SAE_PWE_BOTH;
+    /*if (load_from_nvs()) {
         ESP_LOGI(TAG, "Force to use wifi config from nvs");
-    } else {
+    } else*/ {
         if (ssid) {
             memcpy(wifi_config.sta.ssid, ssid, strlen(ssid) + 1);
         }
diff --git a/examples/voice_agent_lcd/main/CMakeLists.txt b/examples/voice_agent_lcd/main/CMakeLists.txt
index 42e9711..a16f3ec 100755
--- a/examples/voice_agent_lcd/main/CMakeLists.txt
+++ b/examples/voice_agent_lcd/main/CMakeLists.txt
@@ -1,4 +1,4 @@
-file(GLOB_RECURSE ALL_SOURCES ./*.c)
+file(GLOB_RECURSE ALL_SOURCES ./*.c ./*.cpp)
 
 idf_component_register(SRC_DIRS "." "fonts" "images")
 
diff --git a/examples/voice_agent_lcd/main/audio_render_sink.c b/examples/voice_agent_lcd/main/audio_render_sink.c
new file mode 100644
index 0000000..fc7b56d
--- /dev/null
+++ b/examples/voice_agent_lcd/main/audio_render_sink.c
@@ -0,0 +1,178 @@
+#include "av_render.h"
+#include "esp_capture.h"
+
+#include "av_render_default.h"
+#include "codec_init.h"
+#include "esp_audio_dec_default.h"
+#include "esp_audio_enc_default.h"
+#include "esp_capture_audio_enc.h"
+#include "esp_capture_defaults.h"
+#include "esp_capture_path_simple.h"
+#include "esp_check.h"
+#include "esp_log.h"
+#include "media_lib_os.h"
+
+#include "audio_render_sink.h"
+#include "fft.h"
+#include "media.h"
+
+#include "audio_visualizer.h"
+
+typedef struct {
+  audio_render_handle_t audio_renderer;
+  av_render_handle_t av_renderer_handle;
+} renderer_system_t;
+
+// I2S renderer that actually plays the audio. Every sink callback forwards to
+// it; the write path additionally hands the PCM data to the visualizer.
+static audio_render_handle_t real_render = NULL;
+// True while audio_visualizer_init() has succeeded and deinit was not called.
+static bool visualizer_ready = false;
+static const char *TAG = "au_render_sink";
+static renderer_system_t renderer_system;
+static av_render_audio_frame_info_t frame_info = {
+    .sample_rate = 16000,
+    .channel = 2,
+    .bits_per_sample = 16,
+};
+
+// Sink init callback. cfg must point to an i2s_render_cfg_t; it is used to
+// allocate the wrapped I2S renderer, whose handle is returned; NULL on error.
+static audio_render_handle_t au_render_sink_init(void *cfg, int cfg_size) {
+  if (cfg_size != sizeof(i2s_render_cfg_t)) {
+    return NULL;
+  }
+
+  if (real_render == NULL) {
+    real_render = av_render_alloc_i2s_render((i2s_render_cfg_t *)cfg);
+  }
+  if (real_render == NULL) {
+    // Nothing to visualize without a renderer; do not start the FFT thread.
+    return NULL;
+  }
+
+  // Visualization is best effort: playback keeps working without it.
+  visualizer_ready = (audio_visualizer_init() == 0);
+  if (!visualizer_ready) {
+    ESP_LOGW(TAG, "Audio visualizer unavailable");
+  }
+
+  return (audio_render_handle_t)real_render;
+}
+
+// Sink open callback: opens the wrapped renderer with the given frame info.
+static int au_render_sink_open(audio_render_handle_t render,
+                               av_render_audio_frame_info_t *info) {
+  int ret = 0;
+  if (real_render != NULL) {
+    ret = audio_render_open(real_render, info);
+  }
+  return ret;
+}
+
+// Sink write callback: copies the frame to the visualizer, then plays it.
+static int au_render_sink_write(audio_render_handle_t render,
+                                av_render_audio_frame_t *audio_data) {
+  if (real_render == NULL) {
+    return 0;
+  }
+  if (visualizer_ready) {
+    audio_visualizer_processing(audio_data->data, audio_data->size);
+  }
+  // Propagate the renderer status instead of always reporting success.
+  return audio_render_write(real_render, audio_data);
+}
+
+// Sink get_latency callback: forwarded to the wrapped renderer.
+static int au_render_sink_get_latency(audio_render_handle_t render,
+                                      uint32_t *latency) {
+  return audio_render_get_latency(real_render, latency);
+}
+
+// Sink get_frame_info callback: forwarded to the wrapped renderer.
+static int au_render_sink_get_frame_info(audio_render_handle_t render,
+                                         av_render_audio_frame_info_t *info) {
+  return audio_render_get_frame_info(real_render, info);
+}
+
+// Sink set_speed callback: forwarded to the wrapped renderer.
+static int au_render_sink_set_speed(audio_render_handle_t render, float speed) {
+  return audio_render_set_speed(real_render, speed);
+}
+
+// Sink close callback: closes the wrapped renderer and stops the visualizer.
+static int au_render_sink_close(audio_render_handle_t render) {
+  int ret = 0;
+  if (real_render != NULL) {
+    ret = audio_render_close(real_render);
+    if (ret != 0) {
+      ESP_LOGE(TAG, "Failed to close render: %d", ret);
+    }
+    // NOTE(review): the handle is dropped here, so a later open/write on this
+    // sink silently does nothing -- confirm av_render never reopens it.
+    real_render = NULL;
+
+    visualizer_ready = false;
+    audio_visualizer_deinit();
+    ESP_LOGI(TAG, "Audio render sink closed");
+  }
+  return ret;
+}
+
+// Builds an audio renderer whose operations are the sink callbacks above.
+static audio_render_handle_t
+av_render_alloc_au_render_sink(i2s_render_cfg_t *i2s_cfg) {
+  audio_render_cfg_t cfg = {
+      .ops =
+          {
+              .init = au_render_sink_init,
+              .open = au_render_sink_open,
+              .write = au_render_sink_write,
+              .get_latency = au_render_sink_get_latency,
+              .set_speed = au_render_sink_set_speed,
+              .get_frame_info = au_render_sink_get_frame_info,
+              .close = au_render_sink_close,
+          },
+      .cfg = i2s_cfg,
+      .cfg_size = sizeof(i2s_render_cfg_t),
+  };
+  return audio_render_alloc_handle(&cfg);
+}
+
+// Creates the sink-backed audio renderer and the AV renderer that drives it.
+// Returns 0 on success, -1 when either renderer cannot be created.
+int build_player_with_sink_system(void) {
+  i2s_render_cfg_t i2s_cfg = {
+      .play_handle = get_playback_handle(),
+  };
+  renderer_system.audio_renderer = av_render_alloc_au_render_sink(&i2s_cfg);
+  if (renderer_system.audio_renderer == NULL) {
+    ESP_LOGE(TAG, "Fail to create audio render");
+    return -1;
+  }
+  esp_codec_dev_set_out_vol(i2s_cfg.play_handle, CONFIG_DEFAULT_PLAYBACK_VOL);
+
+  av_render_cfg_t render_cfg = {
+      .audio_render = renderer_system.audio_renderer,
+      .audio_raw_fifo_size = 8 * 4096,
+      .audio_render_fifo_size = 100 * 1024,
+      .allow_drop_data = false,
+  };
+
+  renderer_system.av_renderer_handle = av_render_open(&render_cfg);
+
+  if (renderer_system.av_renderer_handle == NULL) {
+    ESP_LOGE(TAG, "Fail to create player");
+    return -1;
+  }
+  // When support AEC, reference data is from speaker right channel for ES8311
+  // so must output 2 channel
+  av_render_set_fixed_frame_info(renderer_system.av_renderer_handle,
+                                 &frame_info);
+  return 0;
+}
+
+// Returns the AV renderer created by build_player_with_sink_system().
+av_render_handle_t media_get_renderer(void) {
+  return renderer_system.av_renderer_handle;
+}
diff --git a/examples/voice_agent_lcd/main/audio_render_sink.h b/examples/voice_agent_lcd/main/audio_render_sink.h
new file mode 100644
index 0000000..010ccbf
--- /dev/null
+++ b/examples/voice_agent_lcd/main/audio_render_sink.h
@@ -0,0 +1,5 @@
+#pragma once
+
+// Creates the visualizer-aware audio renderer and the AV renderer using it.
+// Returns 0 on success, -1 on failure.
+int build_player_with_sink_system(void);
diff --git a/examples/voice_agent_lcd/main/audio_visualizer.cpp b/examples/voice_agent_lcd/main/audio_visualizer.cpp
new file mode 100644
index 0000000..a0b205b
--- /dev/null
+++ b/examples/voice_agent_lcd/main/audio_visualizer.cpp
@@ -0,0 +1,219 @@
+#include "audio_visualizer.h"
+#include "esp_log.h"
+#include "fft.h"
+#include "media_lib_os.h"
+
+#include "freertos/FreeRTOS.h"
+#include "freertos/semphr.h"
+#include "freertos/task.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <queue>
+#include <utility>
+#include <vector>
+
+static const char *TAG = "audio_visualizer";
+
+// Samples analysed per FFT frame; fft_processor_init() needs a power of two.
+static constexpr int kFftSize = 1024;
+// PCM layout of the frames handed to audio_visualizer_processing(): 16-bit
+// interleaved samples. NOTE(review): these mirror the fixed frame info set in
+// audio_render_sink.c -- keep both in sync.
+static constexpr int kSampleRateHz = 16000;
+static constexpr int kChannels = 2;
+// Frames allowed to wait for analysis. Beyond that the oldest one is dropped so
+// a slow analysis thread cannot exhaust the heap.
+static constexpr size_t kMaxQueuedFrames = 8;
+// How long deinit waits for the analysis thread to acknowledge a stop request.
+static constexpr uint32_t kStopTimeoutMs = 3000;
+
+static std::mutex queue_mutex;  // Guards audio_data_queue.
+static std::queue<std::vector<uint8_t>> audio_data_queue;
+static media_lib_thread_handle_t thread;
+static fft_processor_t *fft_processor = NULL;
+static std::atomic<bool> running{false};
+// Both semaphores are created on first init and deliberately never deleted, so
+// no thread can ever touch a freed semaphore.
+static SemaphoreHandle_t sem = NULL;       // "frame queued" / "stop requested"
+static SemaphoreHandle_t exit_sem = NULL;  // Given by the thread when it ends.
+
+static void fft_processor_thread(void *arg);
+
+// Frees the FFT processor and any queued frames. Must only be called while the
+// analysis thread is not running.
+static void release_processor(void) {
+  if (fft_processor) {
+    fft_processor_deinit(fft_processor);
+    free(fft_processor);
+    fft_processor = NULL;
+  }
+  std::lock_guard<std::mutex> lock(queue_mutex);
+  std::queue<std::vector<uint8_t>>().swap(audio_data_queue);
+}
+
+int audio_visualizer_init(void) {
+  if (fft_processor) {
+    // Already set up; only usable if the analysis thread is still alive.
+    return running ? 0 : -1;
+  }
+
+  // calloc so that a failed fft_processor_init() leaves only NULL pointers.
+  fft_processor = (fft_processor_t *)calloc(1, sizeof(fft_processor_t));
+  if (!fft_processor) {
+    ESP_LOGE(TAG, "Failed to allocate FFT processor");
+    return -1;
+  }
+  // Initialize FFT processor
+  esp_err_t ret =
+      fft_processor_init(fft_processor, kFftSize, FFT_WINDOW_HANNING);
+  if (ret != ESP_OK) {
+    ESP_LOGE(TAG, "Failed to initialize FFT processor: %s",
+             esp_err_to_name(ret));
+    release_processor();
+    return -1;
+  }
+
+  if (sem == NULL) {
+    sem = xSemaphoreCreateBinary();
+  }
+  if (exit_sem == NULL) {
+    exit_sem = xSemaphoreCreateBinary();
+  }
+  if (sem == NULL || exit_sem == NULL) {
+    ESP_LOGE(TAG, "Failed to create semaphore");
+    release_processor();
+    return -1;
+  }
+  // Drop a stale exit notification left over from a previous run.
+  xSemaphoreTake(exit_sem, 0);
+
+  running = true;
+  // Create a thread for processing audio data
+  if (media_lib_thread_create_from_scheduler(&thread, "fft_render",
+                                             fft_processor_thread,
+                                             NULL) != 0) {
+    ESP_LOGE(TAG, "Failed to create FFT thread");
+    running = false;
+    thread = NULL;
+    release_processor();
+    return -1;
+  }
+  ESP_LOGI(TAG, "Audio visualizer initialized");
+  return 0;
+}
+
+// Moves the oldest queued frame into frame; returns false if none is queued.
+static bool pop_frame(std::vector<uint8_t> &frame) {
+  std::lock_guard<std::mutex> lock(queue_mutex);
+  if (audio_data_queue.empty()) {
+    return false;
+  }
+  frame = std::move(audio_data_queue.front());
+  audio_data_queue.pop();
+  return true;
+}
+
+// Runs the FFT over one queued PCM frame and logs the resulting bands.
+static void analyze_frame(const std::vector<uint8_t> &frame) {
+  // fft_processor_process() always reads kFftSize samples, so the frame is
+  // downmixed to mono into a buffer of exactly that size: short frames are
+  // zero padded and long ones truncated instead of reading past the vector.
+  static int16_t mono[kFftSize];
+  const size_t bytes_per_step = sizeof(int16_t) * kChannels;
+  const size_t steps =
+      std::min<size_t>(frame.size() / bytes_per_step, kFftSize);
+  for (size_t i = 0; i < steps; i++) {
+    int32_t mixed = 0;
+    for (int ch = 0; ch < kChannels; ch++) {
+      int16_t sample;
+      memcpy(&sample, frame.data() + i * bytes_per_step + ch * sizeof(sample),
+             sizeof(sample));
+      mixed += sample;
+    }
+    mono[i] = (int16_t)(mixed / kChannels);
+  }
+  std::fill(mono + steps, mono + kFftSize, (int16_t)0);
+
+  // Apply FFT processing
+  fft_result_t *fft_result = fft_processor_process(fft_processor, mono);
+  if (!fft_result) {
+    ESP_LOGE(TAG, "FFT processing failed");
+    return;
+  }
+  // Informational output must not use the error level.
+  ESP_LOGD(TAG, "FFT result length: %d", fft_result->length);
+
+  fft_compute_bands_result_t *bands =
+      fft_result_compute_bands(fft_result, 0, 8000, 5, kSampleRateHz);
+  if (bands) {
+    ESP_LOGI(TAG, "FFT bands length: %d", bands->count);
+    for (int i = 0; i < bands->count; i++) {
+      ESP_LOGI(TAG, "Band %d: magnitude=%.2f, frequency=%.2f", i,
+               bands->magnitudes[i], bands->frequencies[i]);
+    }
+    // Free the bands result after processing
+    fft_compute_bands_result_free(bands);
+  }
+
+  fft_result_free(fft_result);
+}
+
+static void fft_processor_thread(void *arg) {
+  std::vector<uint8_t> frame;
+  while (running) {
+    xSemaphoreTake(sem, pdMS_TO_TICKS(1000));
+    while (running && pop_frame(frame)) {
+      analyze_frame(frame);
+    }
+  }
+  // Free the frame explicitly: destructors of locals do not run once the task
+  // deletes itself below.
+  std::vector<uint8_t>().swap(frame);
+  xSemaphoreGive(exit_sem);
+  // A FreeRTOS task must never return from its entry function; passing NULL
+  // destroys the calling thread.
+  media_lib_thread_destroy(NULL);
+}
+
+int audio_visualizer_processing(uint8_t *audio_data, uint32_t data_size) {
+  if (audio_data == NULL || data_size == 0) {
+    return -1;
+  }
+  if (!running) {
+    ESP_LOGE(TAG, "Audio visualizer not running");
+    return -1;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex);
+    if (audio_data_queue.size() >= kMaxQueuedFrames) {
+      audio_data_queue.pop();  // Drop the oldest frame.
+    }
+    audio_data_queue.emplace(audio_data, audio_data + data_size);
+  }
+  xSemaphoreGive(sem);
+  return 0;
+}
+
+int audio_visualizer_deinit(void) {
+  if (!fft_processor) {
+    return 0;
+  }
+  if (running.exchange(false)) {
+    // Wake the thread so it sees the stop request, then wait until it has
+    // finished using the FFT processor before that gets freed.
+    xSemaphoreGive(sem);
+    if (xSemaphoreTake(exit_sem, pdMS_TO_TICKS(kStopTimeoutMs)) != pdTRUE) {
+      ESP_LOGE(TAG, "FFT thread did not stop; keeping its resources");
+      return -1;
+    }
+  }
+  thread = NULL;
+  release_processor();
+  ESP_LOGI(TAG, "FFT processor deinitialized");
+  return 0;
+}
diff --git a/examples/voice_agent_lcd/main/audio_visualizer.h b/examples/voice_agent_lcd/main/audio_visualizer.h
new file mode 100644
index 0000000..b62fbcb
--- /dev/null
+++ b/examples/voice_agent_lcd/main/audio_visualizer.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Starts the FFT analysis thread. Returns 0 on success, -1 on failure. */
+int audio_visualizer_init(void);
+
+/**
+ * Queues a copy of one PCM frame (data_size bytes) for analysis.
+ * Returns 0 when the frame was queued, -1 otherwise.
+ */
+int audio_visualizer_processing(uint8_t *audio_data, uint32_t data_size);
+
+/** Stops the analysis thread and frees its resources. Returns 0 or -1. */
+int audio_visualizer_deinit(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/voice_agent_lcd/main/fft.c b/examples/voice_agent_lcd/main/fft.c
new file mode 100644
index 0000000..d80f4f0
--- /dev/null
+++ b/examples/voice_agent_lcd/main/fft.c
@@ -0,0 +1,325 @@
+#include "fft.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const char *TAG = "FFT_PROCESSOR";
+
+// Create FFT Result. Returns NULL when length is not positive or memory is
+// exhausted; the caller releases the result with fft_result_free().
+fft_result_t *fft_result_create(int length) {
+  if (length <= 0) return NULL;
+
+  fft_result_t *result = malloc(sizeof(*result));
+  if (!result) return NULL;
+
+  result->length = length;
+  result->magnitudes = malloc((size_t)length * sizeof(*result->magnitudes));
+  if (!result->magnitudes) {
+    free(result);
+    return NULL;
+  }
+
+  return result;
+}
+
+// Free FFT Result (NULL is accepted)
+void fft_result_free(fft_result_t *result) {
+  if (result) {
+    free(result->magnitudes);
+    free(result);
+  }
+}
+
+// Helper function to generate Hanning window (size must be at least 3)
+static void generate_hanning_window(float *window, int size) {
+  float sum_sq = 0.0f;
+  for (int i = 0; i < size; i++) {
+    window[i] = 0.5f * (1.0f - cosf(2.0f * M_PI * i / (size - 1)));
+    sum_sq += window[i] * window[i];
+  }
+
+  // Normalize for power conservation (RMS compensation)
+  float rms_compensation = sqrtf((float)size / sum_sq);
+  for (int i = 0; i < size; i++) {
+    window[i] *= rms_compensation;
+  }
+}
+
+// Helper function to generate Hamming window (size must be at least 2)
+static void generate_hamming_window(float *window, int size) {
+  for (int i = 0; i < size; i++) {
+    window[i] = 0.54f - 0.46f * cosf(2.0f * M_PI * i / (size - 1));
+  }
+}
+
+// Frees every buffer and the dl_fft handle owned by the processor and resets
+// the pointers, so it is safe on a partially initialized processor.
+static void fft_processor_release(fft_processor_t *processor) {
+  free(processor->window);
+  processor->window = NULL;
+  free(processor->windowed_buffer);
+  processor->windowed_buffer = NULL;
+  free(processor->fft_buffer);
+  processor->fft_buffer = NULL;
+  free(processor->output_buffer);
+  processor->output_buffer = NULL;
+
+  // Deinitialize dl_fft handle
+  if (processor->fft_handle) {
+    dl_rfft_s16_deinit(processor->fft_handle);
+    processor->fft_handle = NULL;
+  }
+
+  processor->initialized = false;
+}
+
+// Initialize FFT Processor. On failure nothing stays allocated.
+esp_err_t fft_processor_init(fft_processor_t *processor, int buffer_size,
+                             fft_window_type_t window_type) {
+  if (!processor) {
+    return ESP_ERR_INVALID_ARG;
+  }
+  // Start from a known state so every error path can release safely.
+  memset(processor, 0, sizeof(*processor));
+
+  // The window generators divide by (size - 1) and a 2-point Hanning window is
+  // all zeros, so the smallest usable power of two is 4.
+  if (buffer_size < 4) {
+    return ESP_ERR_INVALID_ARG;
+  }
+
+  // Check if buffer size is power of 2
+  if ((buffer_size & (buffer_size - 1)) != 0) {
+    ESP_LOGE(TAG, "Buffer size must be power of 2");
+    return ESP_ERR_INVALID_ARG;
+  }
+
+  processor->buffer_size = buffer_size;
+  processor->buffer_half_size = buffer_size / 2;
+  processor->window_type = window_type;
+  processor->zero_db_reference = 1.0f;  // Standard reference level
+  processor->in_exponent =
+      -15;  // Standard input exponent for int16 as per documentation
+  processor->initialized = false;
+
+  // Allocate memory for buffers
+  processor->window = malloc(buffer_size * sizeof(float));
+  processor->windowed_buffer = malloc(buffer_size * sizeof(int16_t));
+  processor->fft_buffer = malloc(buffer_size * sizeof(int16_t));
+  processor->output_buffer = malloc(buffer_size * sizeof(float));
+
+  if (!processor->window || !processor->windowed_buffer ||
+      !processor->fft_buffer || !processor->output_buffer) {
+    ESP_LOGE(TAG, "Failed to allocate memory for FFT buffers");
+    fft_processor_release(processor);
+    return ESP_ERR_NO_MEM;
+  }
+
+  switch (window_type) {
+    case FFT_WINDOW_HANNING:
+      generate_hanning_window(processor->window, buffer_size);
+      break;
+    case FFT_WINDOW_HAMMING:
+      generate_hamming_window(processor->window, buffer_size);
+      break;
+    case FFT_WINDOW_NONE:
+    default:
+      // No window: unity gain for every sample
+      for (int i = 0; i < buffer_size; i++) {
+        processor->window[i] = 1.0f;
+      }
+      break;
+  }
+
+  // Initialize dl_fft handle for real FFT
+  processor->fft_handle = dl_rfft_s16_init(buffer_size, MALLOC_CAP_8BIT);
+  if (!processor->fft_handle) {
+    ESP_LOGE(TAG, "Failed to initialize dl_fft handle");
+    fft_processor_release(processor);
+    return ESP_ERR_NO_MEM;
+  }
+
+  processor->initialized = true;
+  ESP_LOGI(TAG,
+           "FFT processor initialized with buffer size: %d, window type: %d",
+           buffer_size, window_type);
+
+  return ESP_OK;
+}
+
+// Deinitialize FFT Processor
+void fft_processor_deinit(fft_processor_t *processor) {
+  if (!processor) return;
+
+  fft_processor_release(processor);
+  ESP_LOGI(TAG, "FFT processor deinitialized");
+}
+
+// Helper function to get the magnitude index for a frequency. The result is
+// clamped to [0, bin_count] so it can be used as an (exclusive) array bound.
+static int get_magnitude_index_for_frequency(float frequency, float sample_rate,
+                                             int bin_count) {
+  float nyquist = sample_rate / 2.0f;
+  int index = (int)(bin_count * frequency / nyquist);
+  if (index < 0) return 0;
+  if (index > bin_count) return bin_count;
+  return index;
+}
+
+// Process FFT with int16 input. input_buffer must hold at least buffer_size
+// samples. The caller owns the returned result (fft_result_free()).
+fft_result_t *fft_processor_process(fft_processor_t *processor,
+                                    const int16_t *input_buffer) {
+  if (!processor || !processor->initialized || !input_buffer) {
+    ESP_LOGE(TAG, "Invalid processor or input buffer");
+    return NULL;
+  }
+
+  // Apply window function and convert to int16
+  for (int i = 0; i < processor->buffer_size; i++) {
+    float windowed_value = (float)input_buffer[i] * processor->window[i];
+    // Clamp to int16 range
+    if (windowed_value > 32767.0f) {
+      processor->windowed_buffer[i] = 32767;
+    } else if (windowed_value < -32768.0f) {
+      processor->windowed_buffer[i] = -32768;
+    } else {
+      processor->windowed_buffer[i] = (int16_t)windowed_value;
+    }
+  }
+
+  // Copy windowed data to FFT buffer
+  memcpy(processor->fft_buffer, processor->windowed_buffer,
+         processor->buffer_size * sizeof(int16_t));
+
+  // Perform real FFT using dl_fft
+  dl_rfft_s16_hp_run(processor->fft_handle, processor->fft_buffer,
+                     processor->in_exponent, &processor->fft_exponent);
+
+  // Convert int16 FFT output to float
+  dl_short_to_float(processor->fft_buffer, processor->buffer_size,
+                    processor->fft_exponent, processor->output_buffer);
+
+  // Create result structure
+  fft_result_t *result = fft_result_create(processor->buffer_half_size);
+  if (!result) {
+    ESP_LOGE(TAG, "Failed to create FFT result");
+    return NULL;
+  }
+
+  // Calculate magnitudes and convert to dB.
+  // The real FFT output is packed as [DC, Nyquist, Re1, Im1, Re2, Im2, ...]:
+  // x[0] is the (real) DC component, x[1] the (real) Nyquist component and
+  // x[2k], x[2k+1] the real/imaginary parts of bin k for k = 1..N/2-1.
+  // The result holds bins 0..N/2-1, so the Nyquist bin (index N/2) is not
+  // part of it and x[1] must not be reported as bin N/2-1.
+  for (int i = 0; i < processor->buffer_half_size; i++) {
+    float real = processor->output_buffer[2 * i];
+    float imag = (i == 0) ? 0.0f : processor->output_buffer[2 * i + 1];
+    float magnitude = sqrtf(real * real + imag * imag);
+
+    // Convert to dB
+    if (magnitude > 0.0f) {
+      result->magnitudes[i] =
+          20.0f * log10f(magnitude / processor->zero_db_reference);
+    } else {
+      result->magnitudes[i] = -INFINITY;
+    }
+  }
+
+  return result;
+}
+
+// Compute frequency bands. The requested range is clamped to [0, Nyquist];
+// NULL is returned for invalid arguments, an empty range or lack of memory.
+// The caller owns the result (fft_compute_bands_result_free()).
+fft_compute_bands_result_t *fft_result_compute_bands(
+    const fft_result_t *fft_result, float min_frequency, float max_frequency,
+    int bands_count, float sample_rate) {
+  if (!fft_result || !fft_result->magnitudes || fft_result->length <= 0 ||
+      bands_count <= 0 || !(sample_rate > 0)) {
+    ESP_LOGE(TAG, "Invalid parameters for compute bands");
+    return NULL;
+  }
+
+  float nyquist_frequency = sample_rate / 2.0f;
+  // Clamp the range so the derived indices can never fall outside the
+  // magnitude array (a negative minimum used to index before its start).
+  float actual_min_frequency = fmaxf(0.0f, min_frequency);
+  float actual_max_frequency = fminf(nyquist_frequency, max_frequency);
+  if (!(actual_max_frequency > actual_min_frequency)) {
+    ESP_LOGE(TAG, "Invalid parameters for compute bands");
+    return NULL;
+  }
+
+  // Allocate result structure
+  fft_compute_bands_result_t *result = malloc(sizeof(*result));
+  if (!result) return NULL;
+
+  result->count = bands_count;
+  result->magnitudes = malloc((size_t)bands_count * sizeof(float));
+  result->frequencies = malloc((size_t)bands_count * sizeof(float));
+
+  if (!result->magnitudes || !result->frequencies) {
+    fft_compute_bands_result_free(result);
+    return NULL;
+  }
+
+  int mag_lower_range = get_magnitude_index_for_frequency(
+      actual_min_frequency, sample_rate, fft_result->length);
+  int mag_upper_range = get_magnitude_index_for_frequency(
+      actual_max_frequency, sample_rate, fft_result->length);
+  float ratio = (float)(mag_upper_range - mag_lower_range) / (float)bands_count;
+  // Width of one FFT bin in Hz
+  float bandwidth = nyquist_frequency / (float)fft_result->length;
+
+  for (int i = 0; i < bands_count; i++) {
+    int mags_start_idx = (int)floorf((float)i * ratio) + mag_lower_range;
+    int mags_end_idx = (int)floorf((float)(i + 1) * ratio) + mag_lower_range;
+
+    if (mags_end_idx > mags_start_idx) {
+      // Average the band as power: dB -> linear power, mean, back to dB.
+      // Bins at -INFINITY (zero magnitude) are left out of the mean.
+      float sum_power = 0.0f;
+      int valid_count = 0;
+      for (int j = mags_start_idx; j < mags_end_idx && j < fft_result->length;
+           j++) {
+        if (fft_result->magnitudes[j] > -INFINITY) {
+          // power = 10^(dB/10)
+          sum_power += powf(10.0f, fft_result->magnitudes[j] / 10.0f);
+          valid_count++;
+        }
+      }
+      if (valid_count > 0) {
+        // 10 * log10(mean power) == 20 * log10(RMS magnitude)
+        result->magnitudes[i] = 10.0f * log10f(sum_power / (float)valid_count);
+      } else {
+        result->magnitudes[i] = -INFINITY;
+      }
+    } else if (mags_start_idx < fft_result->length) {
+      // Band narrower than one bin: report that bin as is
+      result->magnitudes[i] = fft_result->magnitudes[mags_start_idx];
+    } else {
+      // No bin at all: mark the band as silent rather than as 0 dB
+      result->magnitudes[i] = -INFINITY;
+    }
+
+    // Compute average frequency for this band
+    result->frequencies[i] =
+        (bandwidth * (float)mags_start_idx + bandwidth * (float)mags_end_idx) /
+        2.0f;
+  }
+
+  return result;
+}
+
+// Free compute bands result (NULL is accepted)
+void fft_compute_bands_result_free(fft_compute_bands_result_t *result) {
+  if (result) {
+    free(result->magnitudes);
+    free(result->frequencies);
+    free(result);
+  }
+}
diff --git a/examples/voice_agent_lcd/main/fft.h
b/examples/voice_agent_lcd/main/fft.h
new file mode 100644
index 0000000..71b1e2f
--- /dev/null
+++ b/examples/voice_agent_lcd/main/fft.h
@@ -0,0 +1,145 @@
+#ifndef FFT_H
+#define FFT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "dl_fft.h"
+#include "dl_rfft.h"
+#include "esp_log.h"
+#include "esp_system.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief FFT Processor Library for ESP32
+ *
+ * This library implements FFT processing using dl_fft's int16 real FFT
+ * functions. It provides real-to-complex FFT processing optimized for audio
+ * analysis with 16-bit integer input.
+ *
+ * Key features:
+ * - Uses dl_fft's int16 real FFT functions for efficiency
+ * - Supports Hanning and Hamming windowing
+ * - Converts FFT results to magnitude spectrum in dB
+ * - Provides frequency band computation
+ * - Handles int16 to float conversion internally
+ */
+
+// FFT Window Types
+typedef enum {
+  FFT_WINDOW_NONE = 0,
+  FFT_WINDOW_HANNING,
+  FFT_WINDOW_HAMMING
+} fft_window_type_t;
+
+// FFT Result structure
+typedef struct {
+  int length;         // Number of entries in magnitudes
+  float *magnitudes;  // Magnitude spectrum in dB [length]
+} fft_result_t;
+
+// FFT Compute Bands Result structure
+typedef struct {
+  int count;           // Number of bands
+  float *magnitudes;   // Per-band magnitude in dB [count]
+  float *frequencies;  // Per-band center frequency in Hz [count]
+} fft_compute_bands_result_t;
+
+// FFT Processor structure for dl_fft int16 real FFT implementation
+typedef struct {
+  int buffer_size;       // Must be power of 2 for dl_fft
+  int buffer_half_size;  // buffer_size / 2
+  fft_window_type_t window_type;
+
+  float *window;             // Window function coefficients [buffer_size]
+  int16_t *windowed_buffer;  // Windowed int16 input data [buffer_size]
+  int16_t *fft_buffer;       // Int16 FFT data buffer [buffer_size]
+  float *output_buffer;      // Float output after conversion [buffer_size]
+  float zero_db_reference;   // Reference level for dB conversion
+
+  // dl_fft specific fields
+  dl_fft_s16_t *fft_handle;  // dl_fft handle
+  int in_exponent;           // Input exponent for scaling
+  int fft_exponent;          // FFT output exponent
+
+  bool initialized;  // Set once fft_processor_init() has succeeded
+} fft_processor_t;
+
+/**
+ * @brief Create FFT result structure
+ * @param length Number of magnitude values (typically buffer_size/2)
+ * @return Pointer to allocated fft_result_t or NULL on failure
+ */
+fft_result_t *fft_result_create(int length);
+
+/**
+ * @brief Free FFT result structure
+ * @param result Pointer to fft_result_t to free
+ */
+void fft_result_free(fft_result_t *result);
+
+/**
+ * @brief Initialize FFT processor using dl_fft int16 real FFT
+ * @param processor Pointer to fft_processor_t structure
+ * @param buffer_size Size of input buffer (must be power of 2)
+ * @param window_type Type of windowing function to apply
+ * @return ESP_OK on success, error code on failure
+ *
+ * Note: This function uses dl_rfft_s16_init() internally
+ */
+esp_err_t fft_processor_init(fft_processor_t *processor, int buffer_size,
+                             fft_window_type_t window_type);
+
+/**
+ * @brief Deinitialize FFT processor and free allocated memory
+ * @param processor Pointer to fft_processor_t structure
+ *
+ * Note: This function calls dl_rfft_s16_deinit() internally
+ */
+void fft_processor_deinit(fft_processor_t *processor);
+
+/**
+ * @brief Process int16 input signal using dl_fft and return magnitude spectrum
+ * @param processor Pointer to initialized fft_processor_t
+ * @param input_buffer Int16 input signal; exactly buffer_size samples are read,
+ *                     so the buffer must hold at least that many
+ * @return Pointer to fft_result_t with magnitude spectrum in dB, or NULL on
+ * failure. The caller releases it with fft_result_free().
+ *
+ * Processing steps:
+ * 1. Apply windowing function (converts to int16)
+ * 2. Perform real FFT using dl_rfft_s16_hp_run()
+ * 3. Convert output to float using dl_short_to_float()
+ * 4. Calculate magnitude spectrum and convert to dB
+ */
+fft_result_t *fft_processor_process(fft_processor_t *processor,
+                                    const int16_t *input_buffer);
+
+/**
+ * @brief Compute frequency bands from FFT magnitude spectrum
+ * @param fft_result FFT result containing magnitude spectrum
+ * @param min_frequency Minimum frequency for band analysis (Hz)
+ * @param max_frequency Maximum frequency for band analysis (Hz)
+ * @param bands_count Number of frequency bands to compute
+ * @param sample_rate Sample rate of the original signal (Hz)
+ * @return Pointer to fft_compute_bands_result_t or NULL on failure. The caller
+ * releases it with fft_compute_bands_result_free().
+ */
+fft_compute_bands_result_t *fft_result_compute_bands(
+    const fft_result_t *fft_result, float min_frequency, float max_frequency,
+    int bands_count, float sample_rate);
+
+/**
+ * @brief Free frequency bands result structure
+ * @param result Pointer to fft_compute_bands_result_t to free
+ */
+void fft_compute_bands_result_free(fft_compute_bands_result_t *result);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // FFT_H
diff --git a/examples/voice_agent_lcd/main/idf_component.yml b/examples/voice_agent_lcd/main/idf_component.yml
index 5ca55e2..3acd516 100644
--- a/examples/voice_agent_lcd/main/idf_component.yml
+++ b/examples/voice_agent_lcd/main/idf_component.yml
@@ -1,15 +1,18 @@
-dependencies:
-  idf: ">=5.4"
-  livekit:
-    path: ../../../components/livekit
-  codec_board:
-    path: ../../../components/third_party/esp-webrtc-solution/components/codec_board
-  # Eventually, the BSP will perform all the functions of the codec_board component.
-  # It currently is used because codec_board is required to support AEC.
-  espressif/esp-box-3: ^3.0.1
-  render_impl:
-    path: ../../../components/third_party/esp-webrtc-solution/components/av_render/render_impl
-  livekit_sandbox:
-    path: ../../../components/livekit_sandbox
-  common:
-    path: ../../common
+dependencies:
+  idf: '>=5.4'
+  livekit:
+    path: ../../../components/livekit
+  codec_board:
+    path: ../../../components/third_party/esp-webrtc-solution/components/codec_board
+  # Eventually, the BSP will perform all the functions of the codec_board component.
+  # It currently is used because codec_board is required to support AEC.
+  espressif/esp-box-3: ^3.0.1
+  render_impl:
+    path:
+      ../../../components/third_party/esp-webrtc-solution/components/av_render/render_impl
+  livekit_sandbox:
+    path: ../../../components/livekit_sandbox
+  common:
+    path: ../../common
+  espp/math: ^1.0.17
+  espressif/dl_fft: ^0.2.0
diff --git a/examples/voice_agent_lcd/main/media.c b/examples/voice_agent_lcd/main/media.c
index b1e42f1..6a90cdf 100644
--- a/examples/voice_agent_lcd/main/media.c
+++ b/examples/voice_agent_lcd/main/media.c
@@ -1,123 +1,84 @@
-#include "esp_check.h"
-#include "esp_log.h"
-#include "codec_init.h"
-#include "esp_capture_path_simple.h"
-#include "esp_capture_audio_enc.h"
 #include "av_render_default.h"
+#include "codec_init.h"
 #include "esp_audio_dec_default.h"
 #include "esp_audio_enc_default.h"
+#include "esp_capture_audio_enc.h"
 #include "esp_capture_defaults.h"
+#include "esp_capture_path_simple.h"
+#include "esp_check.h"
+#include "esp_log.h"
 
+#include "audio_render_sink.h"
 #include "media.h"
 
 static const char *TAG = "media";
 
-#define NULL_CHECK(pointer, message) \
-    ESP_RETURN_ON_FALSE(pointer != NULL, -1, TAG, message)
+#define NULL_CHECK(pointer, message)                      \
+  ESP_RETURN_ON_FALSE(pointer != NULL, -1, TAG, message)
 
 typedef struct {
-    esp_capture_aenc_if_t *audio_encoder;
-    esp_capture_audio_src_if_t *audio_source;
-    esp_capture_path_if_t *capture_path;
-    esp_capture_path_handle_t capturer_handle;
+  esp_capture_aenc_if_t *audio_encoder;
+  esp_capture_audio_src_if_t *audio_source;
+  esp_capture_path_if_t *capture_path;
+  esp_capture_path_handle_t capturer_handle;
 } capture_system_t;
 
-typedef struct {
-    audio_render_handle_t audio_renderer;
-    av_render_handle_t av_renderer_handle;
-} renderer_system_t;
-
-static capture_system_t  capturer_system;
-static renderer_system_t renderer_system;
+static capture_system_t capturer_system;
 
-static int build_capturer_system(void)
-{
-    // 1. Create audio encoder
-    capturer_system.audio_encoder = esp_capture_new_audio_encoder();
-    NULL_CHECK(capturer_system.audio_encoder, "Failed to create audio encoder");
-
-    // 2. Create audio source
-    esp_codec_dev_handle_t record_handle = get_record_handle();
-    NULL_CHECK(record_handle, "Failed to get record handle");
-
-    esp_capture_audio_aec_src_cfg_t codec_cfg = {
-        .record_handle = record_handle,
-        .channel = 4,
-        .channel_mask = 1 | 2
-    };
-    capturer_system.audio_source = esp_capture_new_audio_aec_src(&codec_cfg);
-    NULL_CHECK(capturer_system.audio_source, "Failed to create audio source");
-
-    // 3. Create capture path
-    esp_capture_simple_path_cfg_t path_cfg = {
-        .aenc = capturer_system.audio_encoder,
-    };
-    capturer_system.capture_path = esp_capture_build_simple_path(&path_cfg);
-    NULL_CHECK(capturer_system.capture_path, "Failed to create capture path");
-
-    // 4. Create capture system
-    esp_capture_cfg_t cfg = {
-        .sync_mode = ESP_CAPTURE_SYNC_MODE_AUDIO,
-        .audio_src = capturer_system.audio_source,
-        .capture_path = capturer_system.capture_path,
-    };
-    esp_capture_open(&cfg, &capturer_system.capturer_handle);
-    NULL_CHECK(capturer_system.capturer_handle, "Failed to open capture system");
-    return 0;
+// Creates the audio encoder, AEC audio source and capture path, then opens
+// the capture system. Returns 0 on success, -1 at the first failing step.
+static int build_capturer_system(void) {
+  // 1. Create audio encoder
+  capturer_system.audio_encoder = esp_capture_new_audio_encoder();
+  NULL_CHECK(capturer_system.audio_encoder, "Failed to create audio encoder");
+
+  // 2. Create audio source
+  esp_codec_dev_handle_t record_handle = get_record_handle();
+  NULL_CHECK(record_handle, "Failed to get record handle");
+
+  esp_capture_audio_aec_src_cfg_t codec_cfg = {
+      .record_handle = record_handle, .channel = 4, .channel_mask = 1 | 2};
+  capturer_system.audio_source = esp_capture_new_audio_aec_src(&codec_cfg);
+  NULL_CHECK(capturer_system.audio_source, "Failed to create audio source");
+
+  // 3. Create capture path
+  esp_capture_simple_path_cfg_t path_cfg = {
+      .aenc = capturer_system.audio_encoder,
+  };
+  capturer_system.capture_path = esp_capture_build_simple_path(&path_cfg);
+  NULL_CHECK(capturer_system.capture_path, "Failed to create capture path");
+
+  // 4. Create capture system
+  esp_capture_cfg_t cfg = {
+      .sync_mode = ESP_CAPTURE_SYNC_MODE_AUDIO,
+      .audio_src = capturer_system.audio_source,
+      .capture_path = capturer_system.capture_path,
+  };
+  esp_capture_open(&cfg, &capturer_system.capturer_handle);
+  NULL_CHECK(capturer_system.capturer_handle, "Failed to open capture system");
+  return 0;
 }
 
-static int build_renderer_system(void)
-{
-    // 1. Create audio renderer
-    i2s_render_cfg_t i2s_cfg = {
-        .play_handle = get_playback_handle()
-    };
-    renderer_system.audio_renderer = av_render_alloc_i2s_render(&i2s_cfg);
-    NULL_CHECK(renderer_system.audio_renderer, "Failed to create I2S renderer");
-
-    // Set initial speaker volume
-    esp_codec_dev_set_out_vol(i2s_cfg.play_handle, CONFIG_DEFAULT_PLAYBACK_VOL);
-
-    // 2. Create AV renderer
-    // For this example, this only includes an audio renderer.
-    av_render_cfg_t render_cfg = {
-        .audio_render = renderer_system.audio_renderer,
-        .audio_raw_fifo_size = 8 * 4096,
-        .audio_render_fifo_size = 100 * 1024,
-        .allow_drop_data = false,
-    };
-    renderer_system.av_renderer_handle = av_render_open(&render_cfg);
-    NULL_CHECK(renderer_system.av_renderer_handle, "Failed to create AV renderer");
-
-    // 3. Set frame info
-    av_render_audio_frame_info_t frame_info = {
-        .sample_rate = 16000,
-        .channel = 2,
-        .bits_per_sample = 16,
-    };
-    av_render_set_fixed_frame_info(renderer_system.av_renderer_handle, &frame_info);
-
-    return 0;
-}
-
-int media_init(void)
-{
-    // Register default audio encoder and decoder
-    esp_audio_enc_register_default();
-    esp_audio_dec_register_default();
+// Registers the default codecs and builds the capture and playback systems.
+// Returns 0 on success, -1 when either system could not be built.
+int media_init(void) {
+  // Register default audio encoder and decoder
+  esp_audio_enc_register_default();
+  esp_audio_dec_register_default();
 
-    // Build capturer and renderer systems
-    build_capturer_system();
-    build_renderer_system();
-    return 0;
+  // Build capturer and renderer systems. A failure is reported to the caller
+  // instead of returning success with a half-built pipeline.
+  if (build_capturer_system() != 0) {
+    ESP_LOGE(TAG, "Failed to build capturer system");
+    return -1;
+  }
+  if (build_player_with_sink_system() != 0) {
+    ESP_LOGE(TAG, "Failed to build renderer system");
+    return -1;
+  }
+  return 0;
 }
 
-esp_capture_handle_t media_get_capturer(void)
-{
-    return capturer_system.capturer_handle;
+esp_capture_handle_t media_get_capturer(void) {
+  return capturer_system.capturer_handle;
 }
-
-av_render_handle_t media_get_renderer(void)
-{
-    return renderer_system.av_renderer_handle;
-}
\ No newline at end of file