ESPHome  2024.4.1
micro_wake_word.h
Go to the documentation of this file.
1 #pragma once
2 
8 //
9 #ifndef CLANG_TIDY
10 
11 #ifdef USE_ESP_IDF
12 
14 #include "esphome/core/component.h"
16 
18 
19 #include <tensorflow/lite/core/c/common.h>
20 #include <tensorflow/lite/micro/micro_interpreter.h>
21 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
22 
23 namespace esphome {
24 namespace micro_wake_word {
25 
// Constants fixed by the audio feature preprocessor model.
//
// Number of audio features the preprocessor produces for each slice.
static const uint8_t PREPROCESSOR_FEATURE_SIZE = 40;
// Interval, in milliseconds, at which the preprocessor emits a new feature slice.
static const uint8_t FEATURE_STRIDE_MS = 20;
// Length, in milliseconds, of the audio window fed to the preprocessor.
static const uint8_t FEATURE_DURATION_MS = 30;
// Sampling rate of the incoming audio, in hertz.
static const uint16_t AUDIO_SAMPLE_FREQUENCY = 16000;
// Samples carried over from the previous window into the next feature window.
static const uint16_t HISTORY_SAMPLES_TO_KEEP =
    ((FEATURE_DURATION_MS - FEATURE_STRIDE_MS) * (AUDIO_SAMPLE_FREQUENCY / 1000));
// Fresh samples that must arrive before the next feature window can be formed.
static const uint16_t NEW_SAMPLES_TO_GET = (FEATURE_STRIDE_MS * (AUDIO_SAMPLE_FREQUENCY / 1000));
// Total number of audio samples making up one feature window.
static const uint16_t SAMPLE_DURATION_COUNT = FEATURE_DURATION_MS * AUDIO_SAMPLE_FREQUENCY / 1000;
// Tensor arena size, in bytes, required by the preprocessor model.
static const uint32_t PREPROCESSOR_ARENA_SIZE = 9528;

// Constants configuring the streaming wake word model.
//
// Audio slices to process before a positive detection is accepted.
static const uint8_t MIN_SLICES_BEFORE_DETECTION = 74;

// Tensor arena size, in bytes, for the streaming wake word model.
static const uint32_t STREAMING_MODEL_ARENA_SIZE = 64000;
// Arena size, in bytes, for the streaming model's resource variables.
static const uint32_t STREAMING_MODEL_VARIABLE_ARENA_SIZE = 1024;
54 
// Run state of the wake word component. NOTE(review): the enumerator list
// (original source lines 56-61) has been elided by this Doxygen-collapsed
// listing; is_running() in MicroWakeWord compares against State::IDLE, so
// IDLE is one of the values — recover the rest from the original header.
55 enum State {
62 };
63 
64 class MicroWakeWord : public Component {
// NOTE(review): Doxygen-collapsed listing. The leading number on each line is
// the original file's line number; gaps in that numbering (e.g. 85 -> 87,
// 97 -> 102, 133 -> 141) mark declarations elided from this view.
//
// Component that detects a wake word in microphone audio using two TensorFlow
// Lite Micro models: a feature-generating preprocessor and a streaming
// classifier (see the interpreter and arena members below).
65  public:
// Standard ESPHome Component lifecycle hooks (implemented in the .cpp).
66  void setup() override;
67  void loop() override;
68  float get_setup_priority() const override;
69  void dump_config() override;
70 
// Begin / end listening for the wake word.
71  void start();
72  void stop();
73 
// Detection is considered active whenever the state machine is not idle.
74  bool is_running() const { return this->state_ != State::IDLE; }
75 
// Sets up both TFLite Micro models; presumably returns false on failure
// (inferred from the interpreter/arena members below — confirm in the .cpp).
76  bool initialize_models();
77 
// Name of the wake word this component listens for.
78  std::string get_wake_word() { return this->wake_word_; }
79 
80  // Increasing either of these will reduce the rate of false acceptances while increasing the false rejection rate
81  void set_probability_cutoff(float probability_cutoff) { this->probability_cutoff_ = probability_cutoff; }
82  void set_sliding_window_average_size(size_t size);
83 
// Injects the microphone component that supplies the raw audio samples.
84  void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; }
85 
// (original line 86 elided — per the documentation index it declares
//  Trigger<std::string> *get_wake_word_detected_trigger() const)
87 
// Points model_start_ at the beginning of the wake word model data.
88  void set_model_start(const uint8_t *model_start) { this->model_start_ = model_start; }
89  void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; }
90 
91  protected:
92  void set_state_(State state);
// Reads audio from the microphone into the input buffer; exact meaning of the
// returned count (samples vs bytes) — confirm in the .cpp.
93  int read_microphone_();
94 
// Start of the TFLite model data supplied via set_model_start().
95  const uint8_t *model_start_;
96  std::string wake_word_;
97 
102 
// Buffers raw audio between microphone reads and feature generation.
103  std::unique_ptr<RingBuffer> ring_buffer_;
104 
105  int16_t *input_buffer_;
106 
// Interpreters for the streaming classifier and the feature preprocessor.
// NOTE(review): "interperter" is a typo for "interpreter"; kept as-is here
// because the implementation file references this member name.
109  tflite::MicroInterpreter *streaming_interpreter_{nullptr};
110  tflite::MicroInterpreter *preprocessor_interperter_{nullptr};
111 
113  size_t last_n_index_{0};
114 
117 
118  // When the wake word detection first starts or after the word has been detected once, we ignore this many audio
119  // feature slices before accepting a positive detection again
120  int16_t ignore_windows_{-MIN_SLICES_BEFORE_DETECTION};
121 
// Memory arenas handed to TFLite Micro; sizes correspond to the *_ARENA_SIZE
// constants above.
122  uint8_t *streaming_var_arena_{nullptr};
123  uint8_t *streaming_tensor_arena_{nullptr};
124  uint8_t *preprocessor_tensor_arena_{nullptr};
// Most recently generated slice of int8 features.
125  int8_t *new_features_data_{nullptr};
126 
// Resource variables used by the streaming model (allocated from
// streaming_var_arena_ — confirm in the .cpp).
127  tflite::MicroResourceVariables *mrv_{nullptr};
128 
129  // Stores audio fed into feature generator preprocessor
131 
132  bool detected_{false};
133 
// Detects if the wake word has been said in the most recent audio.
141  bool detect_wake_word_();
142 
// True when enough audio samples are buffered to generate another feature slice.
144  bool slice_available_();
145 
// Shifts previous feature slices over by one and generates a new slice.
151  bool update_features_();
152 
// Runs the preprocessor model over one window of audio samples, writing
// PREPROCESSOR_FEATURE_SIZE features into feature_output.
161  bool generate_single_feature_(const int16_t *audio_data, int audio_data_size,
162  int8_t feature_output[PREPROCESSOR_FEATURE_SIZE]);
163 
169 
// Strides the audio samples, keeping the last 10 ms of the previous slice
// (HISTORY_SAMPLES_TO_KEEP samples) as the start of the next window.
177  bool stride_audio_samples_(int16_t **audio_samples);
178 
// Register the TensorFlow operations each model needs; return true on success.
180  bool register_preprocessor_ops_(tflite::MicroMutableOpResolver<18> &op_resolver);
181 
183  bool register_streaming_ops_(tflite::MicroMutableOpResolver<17> &op_resolver);
184 };
185 
186 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<MicroWakeWord> {
187  public:
188  void play(Ts... x) override { this->parent_->start(); }
189 };
190 
191 template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<MicroWakeWord> {
192  public:
193  void play(Ts... x) override { this->parent_->stop(); }
194 };
195 
196 template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<MicroWakeWord> {
197  public:
198  bool check(Ts... x) override { return this->parent_->is_running(); }
199 };
200 
201 } // namespace micro_wake_word
202 } // namespace esphome
203 
204 #endif // USE_ESP_IDF
205 
206 #endif // CLANG_TIDY
tflite::MicroInterpreter * streaming_interpreter_
bool detect_wake_word_()
Detects if wake word has been said.
void set_model_start(const uint8_t *model_start)
bool register_preprocessor_ops_(tflite::MicroMutableOpResolver< 18 > &op_resolver)
Returns true if successfully registered the preprocessor's TensorFlow operations. ...
Trigger< std::string > * wake_word_detected_trigger_
uint16_t x
Definition: tt21100.cpp:17
std::unique_ptr< RingBuffer > ring_buffer_
std::vector< float > recent_streaming_probabilities_
Helper class to request loop() to be called as fast as possible.
Definition: helpers.h:603
HighFrequencyLoopRequester high_freq_
void set_probability_cutoff(float probability_cutoff)
bool stride_audio_samples_(int16_t **audio_samples)
Strides the audio samples by keeping the last 10 ms of the previous slice.
tflite::MicroInterpreter * preprocessor_interperter_
bool update_features_()
Shifts previous feature slices over by one and generates a new slice of features. ...
float perform_streaming_inference_()
Performs inference over the most recent feature slice with the streaming model.
microphone::Microphone * microphone_
Base class for all automation conditions.
Definition: automation.h:74
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 17 > &op_resolver)
Returns true if successfully registered the streaming model&#39;s TensorFlow operations.
void set_wake_word(const std::string &wake_word)
Trigger< std::string > * get_wake_word_detected_trigger() const
bool generate_single_feature_(const int16_t *audio_data, int audio_data_size, int8_t feature_output[PREPROCESSOR_FEATURE_SIZE])
Generates features from audio samples.
This is a workaround until we can figure out a way to get the tflite-micro idf component code availab...
Definition: a01nyub.cpp:7
void set_microphone(microphone::Microphone *microphone)
bool slice_available_()
Returns true if there are enough audio samples in the buffer to generate another slice of features...
Helper class to easily give an object a parent of type T.
Definition: helpers.h:515
bool state
Definition: fan.h:34
tflite::MicroResourceVariables * mrv_