19 #include <tensorflow/lite/core/c/common.h> 20 #include <tensorflow/lite/micro/micro_interpreter.h> 21 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h> 26 namespace micro_wake_word {
28 static const char *
const TAG =
"micro_wake_word";
30 static const size_t SAMPLE_RATE_HZ = 16000;
31 static const size_t BUFFER_LENGTH = 500;
32 static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
33 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000;
37 static const LogString *micro_wake_word_state_to_string(
State state) {
40 return LOG_STR(
"IDLE");
42 return LOG_STR(
"START_MICROPHONE");
44 return LOG_STR(
"STARTING_MICROPHONE");
46 return LOG_STR(
"DETECTING_WAKE_WORD");
48 return LOG_STR(
"STOP_MICROPHONE");
50 return LOG_STR(
"STOPPING_MICROPHONE");
52 return LOG_STR(
"UNKNOWN");
57 ESP_LOGCONFIG(TAG,
"microWakeWord:");
58 ESP_LOGCONFIG(TAG,
" Wake Word: %s", this->
get_wake_word().c_str());
64 ESP_LOGCONFIG(TAG,
"Setting up microWakeWord...");
67 ESP_LOGE(TAG,
"Failed to initialize models");
75 ESP_LOGW(TAG,
"Could not allocate input buffer");
82 ESP_LOGW(TAG,
"Could not allocate ring buffer");
87 ESP_LOGCONFIG(TAG,
"Micro Wake Word initialized");
92 if (bytes_read == 0) {
98 if (bytes_free < bytes_read) {
100 "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " 101 "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
102 bytes_free, bytes_read);
115 ESP_LOGD(TAG,
"Starting Microphone");
128 ESP_LOGD(TAG,
"Wake Word Detected");
134 ESP_LOGD(TAG,
"Stopping Microphone");
153 ESP_LOGW(TAG,
"Wake word component is marked as failed. Please check setup logs");
157 ESP_LOGW(TAG,
"Wake word is already running");
165 ESP_LOGW(TAG,
"Wake word is already stopped");
169 ESP_LOGW(TAG,
"Wake word is already stopping");
176 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->
state_)),
177 LOG_STR_ARG(micro_wake_word_state_to_string(state)));
188 ESP_LOGE(TAG,
"Could not allocate the streaming model's tensor arena.");
194 ESP_LOGE(TAG,
"Could not allocate the streaming model variable's tensor arena.");
200 ESP_LOGE(TAG,
"Could not allocate the audio preprocessor model's tensor arena.");
206 ESP_LOGE(TAG,
"Could not allocate the audio features buffer.");
212 ESP_LOGE(TAG,
"Could not allocate the audio preprocessor's buffer.");
218 ESP_LOGE(TAG,
"Wake word's audio preprocessor model's schema is not supported");
224 ESP_LOGE(TAG,
"Wake word's streaming model's schema is not supported");
228 static tflite::MicroMutableOpResolver<18> preprocessor_op_resolver;
229 static tflite::MicroMutableOpResolver<17> streaming_op_resolver;
236 tflite::MicroAllocator *ma =
237 tflite::MicroAllocator::Create(this->
streaming_var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
238 this->
mrv_ = tflite::MicroResourceVariables::Create(ma, 15);
240 static tflite::MicroInterpreter static_preprocessor_interpreter(
243 static tflite::MicroInterpreter static_streaming_interpreter(this->
streaming_model_, streaming_op_resolver,
245 STREAMING_MODEL_ARENA_SIZE, this->
mrv_);
252 ESP_LOGE(TAG,
"Failed to allocate tensors for the audio preprocessor");
256 ESP_LOGE(TAG,
"Failed to allocate tensors for the streaming model");
262 if ((input->dims->size != 3) || (input->dims->data[0] != 1) || (input->dims->data[0] != 1) ||
263 (input->dims->data[1] != 1) || (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
264 ESP_LOGE(TAG,
"Wake word detection model tensor input dimensions is not 1x1x%u", input->dims->data[2]);
268 if (input->type != kTfLiteInt8) {
269 ESP_LOGE(TAG,
"Wake word detection model tensor input is not int8.");
275 if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {
276 ESP_LOGE(TAG,
"Wake word detection model tensor output dimensions is not 1x1.");
279 if (output->type != kTfLiteUInt8) {
280 ESP_LOGE(TAG,
"Wake word detection model tensor input is not uint8.");
291 int16_t *audio_samples =
nullptr;
307 size_t bytes_to_copy = input->bytes;
309 memcpy((
void *) (tflite::GetTensorData<int8_t>(input)), (
const void *) (this->
new_features_data_), bytes_to_copy);
311 uint32_t prior_invoke =
millis();
314 if (invoke_status != kTfLiteOk) {
315 ESP_LOGW(TAG,
"Streaming Interpreter Invoke failed");
319 ESP_LOGV(TAG,
"Streaming Inference Latency=%u ms", (
millis() - prior_invoke));
323 return static_cast<float>(output->data.uint8[0]) / 255.0;
357 for (
auto &prob : this->recent_streaming_probabilities_) {
361 ESP_LOGD(TAG,
"Wake word sliding average probability is %.3f and most recent probability is %.3f",
362 sliding_window_average, streaming_prob);
377 return available > (NEW_SAMPLES_TO_GET *
sizeof(int16_t));
387 HISTORY_SAMPLES_TO_KEEP *
sizeof(int16_t));
392 NEW_SAMPLES_TO_GET *
sizeof(int16_t), pdMS_TO_TICKS(200));
394 if (bytes_read == 0) {
395 ESP_LOGE(TAG,
"Could not read data from Ring Buffer");
396 }
else if (bytes_read < NEW_SAMPLES_TO_GET *
sizeof(int16_t)) {
397 ESP_LOGD(TAG,
"Partial Read of Data by Model");
398 ESP_LOGD(TAG,
"Could only read %d bytes when required %d bytes ", bytes_read,
399 (
int) (NEW_SAMPLES_TO_GET *
sizeof(int16_t)));
408 int8_t feature_output[PREPROCESSOR_FEATURE_SIZE]) {
411 std::copy_n(audio_data, audio_data_size, tflite::GetTensorData<int16_t>(input));
414 ESP_LOGE(TAG,
"Failed to preprocess audio for local wake word.");
417 std::memcpy(feature_output, tflite::GetTensorData<int8_t>(output), PREPROCESSOR_FEATURE_SIZE *
sizeof(int8_t));
423 if (op_resolver.AddReshape() != kTfLiteOk)
425 if (op_resolver.AddCast() != kTfLiteOk)
427 if (op_resolver.AddStridedSlice() != kTfLiteOk)
429 if (op_resolver.AddConcatenation() != kTfLiteOk)
431 if (op_resolver.AddMul() != kTfLiteOk)
433 if (op_resolver.AddAdd() != kTfLiteOk)
435 if (op_resolver.AddDiv() != kTfLiteOk)
437 if (op_resolver.AddMinimum() != kTfLiteOk)
439 if (op_resolver.AddMaximum() != kTfLiteOk)
441 if (op_resolver.AddWindow() != kTfLiteOk)
443 if (op_resolver.AddFftAutoScale() != kTfLiteOk)
445 if (op_resolver.AddRfft() != kTfLiteOk)
447 if (op_resolver.AddEnergy() != kTfLiteOk)
449 if (op_resolver.AddFilterBank() != kTfLiteOk)
451 if (op_resolver.AddFilterBankSquareRoot() != kTfLiteOk)
453 if (op_resolver.AddFilterBankSpectralSubtraction() != kTfLiteOk)
455 if (op_resolver.AddPCAN() != kTfLiteOk)
457 if (op_resolver.AddFilterBankLog() != kTfLiteOk)
464 if (op_resolver.AddCallOnce() != kTfLiteOk)
466 if (op_resolver.AddVarHandle() != kTfLiteOk)
468 if (op_resolver.AddReshape() != kTfLiteOk)
470 if (op_resolver.AddReadVariable() != kTfLiteOk)
472 if (op_resolver.AddStridedSlice() != kTfLiteOk)
474 if (op_resolver.AddConcatenation() != kTfLiteOk)
476 if (op_resolver.AddAssignVariable() != kTfLiteOk)
478 if (op_resolver.AddConv2D() != kTfLiteOk)
480 if (op_resolver.AddMul() != kTfLiteOk)
482 if (op_resolver.AddAdd() != kTfLiteOk)
484 if (op_resolver.AddMean() != kTfLiteOk)
486 if (op_resolver.AddFullyConnected() != kTfLiteOk)
488 if (op_resolver.AddLogistic() != kTfLiteOk)
490 if (op_resolver.AddQuantize() != kTfLiteOk)
492 if (op_resolver.AddDepthwiseConv2D() != kTfLiteOk)
494 if (op_resolver.AddAveragePool2D() != kTfLiteOk)
496 if (op_resolver.AddMaxPool2D() != kTfLiteOk)
505 #endif // USE_ESP_IDF
tflite::MicroInterpreter * streaming_interpreter_
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
bool detect_wake_word_()
Detects if wake word has been said.
int16_t * preprocessor_audio_buffer_
bool register_preprocessor_ops_(tflite::MicroMutableOpResolver< 18 > &op_resolver)
Returns true if successfully registered the preprocessor's TensorFlow operations. ...
const tflite::Model * preprocessor_model_
void set_state_(State state)
Trigger< std::string > * wake_word_detected_trigger_
std::string get_wake_word()
std::unique_ptr< RingBuffer > ring_buffer_
const uint8_t * model_start_
std::vector< float > recent_streaming_probabilities_
An STL allocator that uses SPI RAM.
uint8_t * streaming_var_arena_
float get_setup_priority() const override
uint8_t * streaming_tensor_arena_
HighFrequencyLoopRequester high_freq_
size_t sliding_window_average_size_
bool stride_audio_samples_(int16_t **audio_samples)
Strides the audio samples by keeping the last 10 ms of the previous slice.
uint32_t IRAM_ATTR HOT millis()
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
tflite::MicroInterpreter * preprocessor_interperter_
bool update_features_()
Shifts previous feature slices over by one and generates a new slice of features. ...
float perform_streaming_inference_()
Performs inference over the most recent feature slice with the streaming model.
void dump_config() override
microphone::Microphone * microphone_
void start()
Start running the loop continuously.
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 17 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
void stop()
Stop running the loop continuously.
int8_t * new_features_data_
virtual size_t read(int16_t *buf, size_t len)=0
virtual void mark_failed()
Mark this component as failed.
bool generate_single_feature_(const int16_t *audio_data, int audio_data_size, int8_t feature_output[PREPROCESSOR_FEATURE_SIZE])
Generates features from audio samples.
This is a workaround until we can figure out a way to get the tflite-micro idf component code available...
bool slice_available_()
Returns true if there are enough audio samples in the buffer to generate another slice of features...
uint8_t * preprocessor_tensor_arena_
void set_sliding_window_average_size(size_t size)
static std::unique_ptr< RingBuffer > create(size_t len)
const tflite::Model * streaming_model_
float probability_cutoff_
const unsigned char G_AUDIO_PREPROCESSOR_INT8_TFLITE[]
tflite::MicroResourceVariables * mrv_