api/voice__assistant_8cpp_source.html

 #include "voice_assistant.h"

 #ifdef USE_VOICE_ASSISTANT

 #include "esphome/core/log.h"

 #include <cstdio>

 namespace esphome {
 namespace voice_assistant {

 // Tag used for every log line emitted by this component.
 static const char *const TAG = "voice_assistant";

 // Drop any macro of the same name so the typed constant below can be declared.
 #ifdef SAMPLE_RATE_HZ
 #undef SAMPLE_RATE_HZ
 #endif

 // Audio sample rate in Hz.
 static constexpr size_t SAMPLE_RATE_HZ = 16000;
 // Number of int16_t samples in the microphone input buffer: 32 ms worth at SAMPLE_RATE_HZ.
 static constexpr size_t INPUT_BUFFER_SIZE = SAMPLE_RATE_HZ * 32 / 1000;
 // Number of int16_t samples the ring buffer is sized for: 1024 ms worth at SAMPLE_RATE_HZ.
 static constexpr size_t BUFFER_SIZE = SAMPLE_RATE_HZ * 1024 / 1000;
 // Size in bytes of one outgoing audio chunk (one full input buffer).
 static constexpr size_t SEND_BUFFER_SIZE = sizeof(int16_t) * INPUT_BUFFER_SIZE;
 // Maximum number of bytes read from the socket in one call.
 static constexpr size_t RECEIVE_SIZE = 1024;
 // Size in bytes of the speaker staging buffer.
 static constexpr size_t SPEAKER_BUFFER_SIZE = RECEIVE_SIZE * 16;

 // Report the setup priority of this component: setup_priority::AFTER_CONNECTION.
 float VoiceAssistant::get_setup_priority() const {
   return setup_priority::AFTER_CONNECTION;
 }

 // Create and configure the UDP socket used for audio streaming.
 //
 // The socket is created as an AF_INET datagram socket, SO_REUSEADDR is requested (a failure there
 // is only logged) and the socket is switched to non-blocking mode. When built with USE_SPEAKER and
 // a speaker is configured, the socket is additionally bound to the "any" address on port 6055.
 //
 // Returns true once the socket is ready (udp_socket_running_ is set). Returns false after a fatal
 // error; in that case the component has been marked as failed.
 bool VoiceAssistant::start_udp_socket_() {
   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
   if (this->socket_ == nullptr) {
     ESP_LOGE(TAG, "Could not create socket");
     this->mark_failed();
     return false;
   }
   int enable = 1;
   int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
   if (err != 0) {
     // Report errno rather than the call's return code, which only signals that the call failed.
     ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", errno);
     // we can still continue
   }
   err = this->socket_->setblocking(false);
   if (err != 0) {
     ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", errno);
     this->mark_failed();
     return false;
   }

 #ifdef USE_SPEAKER
   if (this->speaker_ != nullptr) {
     struct sockaddr_storage server;

     socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
     if (sl == 0) {
       ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
       this->mark_failed();
       return false;
     }

     // Pass the length of the address that was actually written, not the size of the storage.
     err = this->socket_->bind((struct sockaddr *) &server, sl);
     if (err != 0) {
       ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
       this->mark_failed();
       return false;
     }
   }
 #endif
   this->udp_socket_running_ = true;
   return true;
 }

 void VoiceAssistant::setup() {
   ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");

   global_voice_assistant = this;

 #ifdef USE_SPEAKER
   if (this->speaker_ != nullptr) {
     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
     if (this->speaker_buffer_ == nullptr) {
       ESP_LOGW(TAG, "Could not allocate speaker buffer");
       this->mark_failed();
       return;
     }
   }
 #endif

   ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
   this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
   if (this->input_buffer_ == nullptr) {
     ESP_LOGW(TAG, "Could not allocate input buffer");
     this->mark_failed();
     return;
   }

 #ifdef USE_ESP_ADF
   this->vad_instance_ = vad_create(VAD_MODE_4);
 #endif

   this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
   if (this->ring_buffer_ == nullptr) {
     ESP_LOGW(TAG, "Could not allocate ring buffer");
     this->mark_failed();
     return;
   }

   ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
   this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
   if (send_buffer_ == nullptr) {
     ESP_LOGW(TAG, "Could not allocate send buffer");
     this->mark_failed();
     return;
   }
 }

 // Read one chunk from the microphone into input_buffer_ and append it to the ring buffer.
 //
 // Returns the number of bytes read. Returns 0 when the microphone is not running or delivered no
 // data; in the latter case input_buffer_ is zeroed.
 int VoiceAssistant::read_microphone_() {
   if (!this->mic_->is_running()) {
     ESP_LOGD(TAG, "microphone not running");
     return 0;
   }

   const size_t capacity_bytes = INPUT_BUFFER_SIZE * sizeof(int16_t);
   const size_t got = this->mic_->read(this->input_buffer_, capacity_bytes);
   if (got == 0) {
     memset(this->input_buffer_, 0, capacity_bytes);
     return 0;
   }

   // Queue the fresh audio for the streaming stage.
   this->ring_buffer_->write((void *) this->input_buffer_, got);
   return got;
 }

 // Main state machine, dispatched on state_.
 //
 // Before dispatching, a lost API client (api_client_ == nullptr) while in any state other than
 // IDLE, STOP_MICROPHONE or STOPPING_MICROPHONE aborts the current run: the microphone is stopped
 // if needed, continuous mode is cleared and signal_stop_() is called.
 void VoiceAssistant::loop() {
   if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
       this->state_ != State::STOPPING_MICROPHONE) {
     // Client went away mid-run: stop the microphone first if it is running or being started,
     // otherwise go straight to IDLE.
     if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
       this->set_state_(State::STOP_MICROPHONE, State::IDLE);
     } else {
       this->set_state_(State::IDLE, State::IDLE);
     }
     this->continuous_ = false;
     this->signal_stop_();
     return;
   }
   switch (this->state_) {
     case State::IDLE: {
       // In continuous mode a new run is started as soon as the previous one has settled in IDLE.
       if (this->continuous_ && this->desired_state_ == State::IDLE) {
         this->idle_trigger_->trigger();

         this->ring_buffer_->reset();
 #ifdef USE_ESP_ADF
         if (this->use_wake_word_) {
           this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
         } else
 #endif
         {
           this->set_state_(State::START_PIPELINE, State::START_MICROPHONE);
         }
       } else {
         // Nothing to do: release the high-frequency loop request.
         this->high_freq_.stop();
       }
       break;
     }
     case State::START_MICROPHONE: {
       ESP_LOGD(TAG, "Starting Microphone");
       // Clear stale audio from both buffers before the microphone is started.
       memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
       memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
       this->mic_->start();
       this->high_freq_.start();
       this->set_state_(State::STARTING_MICROPHONE);
       break;
     }
     case State::STARTING_MICROPHONE: {
       // Stay here until the microphone reports that it is running, then move to the desired state.
       if (this->mic_->is_running()) {
         this->set_state_(this->desired_state_);
       }
       break;
     }
 #ifdef USE_ESP_ADF
     case State::WAIT_FOR_VAD: {
       this->read_microphone_();
       ESP_LOGD(TAG, "Waiting for speech...");
       this->set_state_(State::WAITING_FOR_VAD);
       break;
     }
     case State::WAITING_FOR_VAD: {
       size_t bytes_read = this->read_microphone_();
       if (bytes_read > 0) {
         vad_state_t vad_state =
             vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
         // vad_counter_ goes up on speech frames and down on non-speech frames; the pipeline is
         // started on a speech frame seen once the counter has reached vad_threshold_.
         if (vad_state == VAD_SPEECH) {
           if (this->vad_counter_ < this->vad_threshold_) {
             this->vad_counter_++;
           } else {
             ESP_LOGD(TAG, "VAD detected speech");
             this->set_state_(State::START_PIPELINE, State::STREAMING_MICROPHONE);

             // Reset for next time
             this->vad_counter_ = 0;
           }
         } else {
           if (this->vad_counter_ > 0) {
             this->vad_counter_--;
           }
         }
       }
       break;
     }
 #endif
     case State::START_PIPELINE: {
       // Keep reading the microphone while the start request is built and sent.
       this->read_microphone_();
       ESP_LOGD(TAG, "Requesting start...");
       uint32_t flags = 0;
       if (this->use_wake_word_)
         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD;
       if (this->silence_detection_)
         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD;
       api::VoiceAssistantAudioSettings audio_settings;
       audio_settings.noise_suppression_level = this->noise_suppression_level_;
       audio_settings.auto_gain = this->auto_gain_;
       audio_settings.volume_multiplier = this->volume_multiplier_;

       api::VoiceAssistantRequest msg;
       msg.start = true;
       msg.conversation_id = this->conversation_id_;
       msg.flags = flags;
       msg.audio_settings = audio_settings;
       msg.wake_word_phrase = this->wake_word_;
       // The wake word phrase is sent once and then cleared.
       this->wake_word_ = "";

       if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) {
         ESP_LOGW(TAG, "Could not request start");
         this->error_trigger_->trigger("not-connected", "Could not request start");
         this->continuous_ = false;
         this->set_state_(State::IDLE, State::IDLE);
         break;
       }
       this->set_state_(State::STARTING_PIPELINE);
       // Forget the conversation id once this timeout fires (5 * 60 * 1000; presumably milliseconds,
       // i.e. five minutes - TODO confirm the unit of set_timeout).
       this->set_timeout("reset-conversation_id", 5 * 60 * 1000, [this]() { this->conversation_id_ = ""; });
       break;
     }
     case State::STARTING_PIPELINE: {
       this->read_microphone_();
       break;  // State changed when udp server port received
     }
     case State::STREAMING_MICROPHONE: {
       this->read_microphone_();
       size_t available = this->ring_buffer_->available();
       // Drain the ring buffer in SEND_BUFFER_SIZE chunks, sending each one through the API
       // connection or the UDP socket depending on audio_mode_.
       while (available >= SEND_BUFFER_SIZE) {
         size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
         if (this->audio_mode_ == AUDIO_MODE_API) {
           api::VoiceAssistantAudio msg;
           msg.data.assign((char *) this->send_buffer_, read_bytes);
           this->api_client_->send_voice_assistant_audio(msg);
         } else {
           // The UDP socket is created lazily on the first chunk.
           if (!this->udp_socket_running_) {
             if (!this->start_udp_socket_()) {
               this->set_state_(State::STOP_MICROPHONE, State::IDLE);
               break;
             }
           }
           this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
                                 sizeof(this->dest_addr_));
         }
         available = this->ring_buffer_->available();
       }

       break;
     }
     case State::STOP_MICROPHONE: {
       if (this->mic_->is_running()) {
         this->mic_->stop();
         this->set_state_(State::STOPPING_MICROPHONE);
       } else {
         // Already stopped: skip the STOPPING_MICROPHONE wait.
         this->set_state_(this->desired_state_);
       }
       break;
     }
     case State::STOPPING_MICROPHONE: {
       if (this->mic_->is_stopped()) {
         this->set_state_(this->desired_state_);
       }
       break;
     }
     case State::AWAITING_RESPONSE: {
       break;  // State changed by events
     }
     case State::STREAMING_RESPONSE: {
       bool playing = false;
 #ifdef USE_SPEAKER
       if (this->speaker_ != nullptr) {
         ssize_t received_len = 0;
         if (this->audio_mode_ == AUDIO_MODE_UDP) {
           // Only read from the socket while a full RECEIVE_SIZE chunk still fits in the buffer.
           if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
             received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
             if (received_len > 0) {
               this->speaker_buffer_index_ += received_len;
               this->speaker_buffer_size_ += received_len;
               this->speaker_bytes_received_ += received_len;
             }
           } else {
             ESP_LOGD(TAG, "Receive buffer full");
           }
         }
         // Build a small buffer of audio before sending to the speaker
         bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
         if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
           this->write_speaker_();
         if (this->wait_for_stream_end_) {
           this->cancel_timeout("playing");
           if (end_of_stream) {
             ESP_LOGD(TAG, "End of audio stream received");
             this->cancel_timeout("speaker-timeout");
             this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
           }
           break;  // We dont want to timeout here as the STREAM_END event will take care of that.
         }
         playing = this->speaker_->is_running();
       }
 #endif
 #ifdef USE_MEDIA_PLAYER
       if (this->media_player_ != nullptr) {
         playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_PLAYING);
       }
 #endif
       // While output is playing, keep re-arming the "playing" timeout; when it finally fires the
       // component returns to IDLE.
       if (playing) {
         this->set_timeout("playing", 2000, [this]() {
           this->cancel_timeout("speaker-timeout");
           this->set_state_(State::IDLE, State::IDLE);
         });
       }
       break;
     }
     case State::RESPONSE_FINISHED: {
 #ifdef USE_SPEAKER
       if (this->speaker_ != nullptr) {
         // First flush whatever is still staged locally, one write attempt per loop iteration.
         if (this->speaker_buffer_size_ > 0) {
           this->write_speaker_();
           break;
         }
         // Then wait for the speaker itself to finish.
         if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
           break;
         }
         ESP_LOGD(TAG, "Speaker has finished outputting all audio");
         this->speaker_->stop();
         this->cancel_timeout("speaker-timeout");
         this->cancel_timeout("playing");
         // Reset all playback bookkeeping for the next response.
         this->speaker_buffer_size_ = 0;
         this->speaker_buffer_index_ = 0;
         this->speaker_bytes_received_ = 0;
         memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
         this->wait_for_stream_end_ = false;
         this->stream_ended_ = false;

         this->tts_stream_end_trigger_->trigger();
       }
 #endif
       this->set_state_(State::IDLE, State::IDLE);
       break;
     }
     default:
       break;
   }
 }

 #ifdef USE_SPEAKER
 // Hand the staged response audio to the speaker and compact the staging buffer by however many
 // bytes the speaker accepted. Each successful write re-arms the "speaker-timeout" timeout, whose
 // callback stops the speaker.
 void VoiceAssistant::write_speaker_() {
   if (!(this->speaker_buffer_size_ > 0)) {
     return;  // nothing staged
   }

   const size_t consumed = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_);
   if (consumed == 0) {
     ESP_LOGD(TAG, "Speaker buffer full, trying again next loop");
     return;
   }

   // Shift the unplayed tail to the front of the buffer and adjust the bookkeeping.
   memmove(this->speaker_buffer_, this->speaker_buffer_ + consumed, this->speaker_buffer_size_ - consumed);
   this->speaker_buffer_size_ -= consumed;
   this->speaker_buffer_index_ -= consumed;
   this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
 }
 #endif

 // Register or unregister the API connection that drives the voice assistant.
 //
 // Only one client may be subscribed at a time. A second subscribe attempt, or an unsubscribe
 // from a connection that is not the current client, is logged as an error and ignored.
 void VoiceAssistant::client_subscription(api::APIConnection *client, bool subscribe) {
   if (subscribe) {
     if (this->api_client_ != nullptr) {
       ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
       ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
       ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
       return;
     }

     this->api_client_ = client;
     this->client_connected_trigger_->trigger();
     return;
   }

   // Unsubscribe path: only the currently registered client may detach.
   const bool is_current_client = this->api_client_ != nullptr && client == this->api_client_;
   if (!is_current_client) {
     ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
     return;
   }
   this->api_client_ = nullptr;
   this->client_disconnected_trigger_->trigger();
 }

 // Map a State value to its name for logging; values without a case yield "UNKNOWN".
 static const LogString *voice_assistant_state_to_string(State state) {
   const LogString *label = LOG_STR("UNKNOWN");
   switch (state) {
     case State::IDLE:
       label = LOG_STR("IDLE");
       break;
     case State::START_MICROPHONE:
       label = LOG_STR("START_MICROPHONE");
       break;
     case State::STARTING_MICROPHONE:
       label = LOG_STR("STARTING_MICROPHONE");
       break;
     case State::WAIT_FOR_VAD:
       label = LOG_STR("WAIT_FOR_VAD");
       break;
     case State::WAITING_FOR_VAD:
       label = LOG_STR("WAITING_FOR_VAD");
       break;
     case State::START_PIPELINE:
       label = LOG_STR("START_PIPELINE");
       break;
     case State::STARTING_PIPELINE:
       label = LOG_STR("STARTING_PIPELINE");
       break;
     case State::STREAMING_MICROPHONE:
       label = LOG_STR("STREAMING_MICROPHONE");
       break;
     case State::STOP_MICROPHONE:
       label = LOG_STR("STOP_MICROPHONE");
       break;
     case State::STOPPING_MICROPHONE:
       label = LOG_STR("STOPPING_MICROPHONE");
       break;
     case State::AWAITING_RESPONSE:
       label = LOG_STR("AWAITING_RESPONSE");
       break;
     case State::STREAMING_RESPONSE:
       label = LOG_STR("STREAMING_RESPONSE");
       break;
     case State::RESPONSE_FINISHED:
       label = LOG_STR("RESPONSE_FINISHED");
       break;
     default:
       break;
   }
   return label;
 }

 // Switch the state machine to `state` and log the transition at debug level.
 // desired_state_ is left untouched.
 void VoiceAssistant::set_state_(State state) {
   const State previous = this->state_;
   this->state_ = state;
   ESP_LOGD(TAG, "State changed from %s to %s",
            LOG_STR_ARG(voice_assistant_state_to_string(previous)),
            LOG_STR_ARG(voice_assistant_state_to_string(state)));
 }

 // Switch to `state` (via the single-argument overload) and record `desired_state` as the state
 // that transitional states move on to. The new desired state is logged at debug level.
 void VoiceAssistant::set_state_(State state, State desired_state) {
   this->set_state_(state);

   this->desired_state_ = desired_state;
   ESP_LOGD(TAG, "Desired state set to %s",
            LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
 }

 // Report that the pipeline could not be started: log the failure, fire the error trigger with
 // code "failed-to-start", then stop the microphone with IDLE as the desired state.
 void VoiceAssistant::failed_to_start() {
   static const char *const code = "failed-to-start";
   static const char *const message = "Failed to start server. See Home Assistant logs for more details.";

   ESP_LOGE(TAG, "%s", message);
   this->error_trigger_->trigger(code, message);
   this->set_state_(State::STOP_MICROPHONE, State::IDLE);
 }

 void VoiceAssistant::start_streaming() {
   if (this->state_ != State::STARTING_PIPELINE) {
     this->signal_stop_();
     return;
   }

   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_API;

   if (this->mic_->is_running()) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
   }
 }

 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
   if (this->state_ != State::STARTING_PIPELINE) {
     this->signal_stop_();
     return;
   }

   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_UDP;

   memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
   if (this->dest_addr_.ss_family == AF_INET) {
     ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
   }
 #if LWIP_IPV6
   else if (this->dest_addr_.ss_family == AF_INET6) {
     ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
   }
 #endif
   else {
     ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
     return;
   }

   if (this->mic_->is_running()) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
   }
 }

 // Ask for a new voice assistant run.
 //
 // Without a connected API client the request is rejected: the state is forced to IDLE and
 // continuous mode is cleared. Otherwise the request only takes effect while IDLE; in any other
 // state it is ignored.
 void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
   if (this->api_client_ == nullptr) {
     ESP_LOGE(TAG, "No API client connected");
     this->set_state_(State::IDLE, State::IDLE);
     this->continuous_ = false;
     return;
   }

   if (this->state_ != State::IDLE) {
     return;  // a run is already in progress
   }

   this->continuous_ = continuous;
   this->silence_detection_ = silence_detection;
   this->ring_buffer_->reset();
 #ifdef USE_ESP_ADF
   if (this->use_wake_word_) {
     this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
   } else
 #endif
   {
     this->set_state_(State::START_PIPELINE, State::START_MICROPHONE);
   }
 }

 // Ask for the current run to end. Continuous mode is always cleared; what else happens depends
 // on how far the run has progressed.
 void VoiceAssistant::request_stop() {
   this->continuous_ = false;

   switch (this->state_) {
     // The pipeline has been requested or is receiving audio: signal a stop, then stop the microphone.
     case State::STARTING_PIPELINE:
     case State::STREAMING_MICROPHONE:
       this->signal_stop_();
       this->set_state_(State::STOP_MICROPHONE, State::IDLE);
       break;

     // No pipeline request has been sent yet: stopping the microphone is enough.
     case State::START_MICROPHONE:
     case State::STARTING_MICROPHONE:
     case State::WAIT_FOR_VAD:
     case State::WAITING_FOR_VAD:
     case State::START_PIPELINE:
       this->set_state_(State::STOP_MICROPHONE, State::IDLE);
       break;

     // Microphone is already being stopped: just make sure we end up in IDLE.
     case State::STOP_MICROPHONE:
     case State::STOPPING_MICROPHONE:
       this->desired_state_ = State::IDLE;
       break;

     // Nothing to stop while idle.
     case State::IDLE:
       break;

     // Let the incoming audio stream finish then it will go to idle.
     case State::AWAITING_RESPONSE:
     case State::STREAMING_RESPONSE:
     case State::RESPONSE_FINISHED:
       break;
   }
 }

 void VoiceAssistant::signal_stop_() {
   memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
   if (this->api_client_ == nullptr) {
     return;
   }
   ESP_LOGD(TAG, "Signaling stop...");
   api::VoiceAssistantRequest msg;
   msg.start = false;
   this->api_client_->send_voice_assistant_request(msg);
 }

 // Handle a pipeline event received from the API client.
 //
 // Most events only fire the matching automation trigger, deferred to the next loop() call.
 // A few also drive the state machine directly:
 //  - TTS_END:     moves to STREAMING_RESPONSE (local output) or IDLE.
 //  - RUN_END:     returns to IDLE (or to VAD waiting when a wake word is used) if the run ended
 //                 while still streaming the microphone or awaiting a response.
 //  - ERROR:       signals a stop and stops the microphone, except for the wake-word related
 //                 codes handled separately below.
 //  - STT_VAD_END: stops the microphone and waits for the response.
 //
 // Events that lack their expected argument (text/url) are logged and dropped.
 void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
   ESP_LOGD(TAG, "Event Type: %d", msg.event_type);
   switch (msg.event_type) {
     case api::enums::VOICE_ASSISTANT_RUN_START:
       ESP_LOGD(TAG, "Assist Pipeline running");
       this->defer([this]() { this->start_trigger_->trigger(); });
       break;
     case api::enums::VOICE_ASSISTANT_WAKE_WORD_START:
       break;
     case api::enums::VOICE_ASSISTANT_WAKE_WORD_END: {
       ESP_LOGD(TAG, "Wake word detected");
       this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
       break;
     }
     case api::enums::VOICE_ASSISTANT_STT_START:
       ESP_LOGD(TAG, "STT started");
       this->defer([this]() { this->listening_trigger_->trigger(); });
       break;
     case api::enums::VOICE_ASSISTANT_STT_END: {
       std::string text;
       // Iterate by reference: only the matching value needs to be copied out of the message.
       for (const auto &arg : msg.data) {
         if (arg.name == "text") {
           text = arg.value;
         }
       }
       if (text.empty()) {
         ESP_LOGW(TAG, "No text in STT_END event");
         return;
       }
       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
       this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
       break;
     }
     case api::enums::VOICE_ASSISTANT_INTENT_START:
       ESP_LOGD(TAG, "Intent started");
       this->defer([this]() { this->intent_start_trigger_->trigger(); });
       break;
     case api::enums::VOICE_ASSISTANT_INTENT_END: {
       // Remember the conversation id so the next request can continue the conversation.
       for (const auto &arg : msg.data) {
         if (arg.name == "conversation_id") {
           this->conversation_id_ = arg.value;
         }
       }
       this->defer([this]() { this->intent_end_trigger_->trigger(); });
       break;
     }
     case api::enums::VOICE_ASSISTANT_TTS_START: {
       std::string text;
       for (const auto &arg : msg.data) {
         if (arg.name == "text") {
           text = arg.value;
         }
       }
       if (text.empty()) {
         ESP_LOGW(TAG, "No text in TTS_START event");
         return;
       }
       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
       this->defer([this, text]() {
         this->tts_start_trigger_->trigger(text);
 #ifdef USE_SPEAKER
         // USE_SPEAKER only means speaker support is compiled in; this instance may still have no
         // speaker configured, so guard the call like every other speaker access in this file.
         if (this->speaker_ != nullptr) {
           this->speaker_->start();
         }
 #endif
       });
       break;
     }
     case api::enums::VOICE_ASSISTANT_TTS_END: {
       std::string url;
       for (const auto &arg : msg.data) {
         if (arg.name == "url") {
           url = arg.value;
         }
       }
       if (url.empty()) {
         ESP_LOGW(TAG, "No url in TTS_END event");
         return;
       }
       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
       this->defer([this, url]() {
 #ifdef USE_MEDIA_PLAYER
         if (this->media_player_ != nullptr) {
           this->media_player_->make_call().set_media_url(url).perform();
         }
 #endif
         this->tts_end_trigger_->trigger(url);
       });
       State new_state = this->local_output_ ? State::STREAMING_RESPONSE : State::IDLE;
       this->set_state_(new_state, new_state);
       break;
     }
     case api::enums::VOICE_ASSISTANT_RUN_END: {
       ESP_LOGD(TAG, "Assist Pipeline ended");
       if (this->state_ == State::STREAMING_MICROPHONE) {
         this->ring_buffer_->reset();
 #ifdef USE_ESP_ADF
         if (this->use_wake_word_) {
           // No need to stop the microphone since we didn't use the speaker
           this->set_state_(State::WAIT_FOR_VAD, State::WAITING_FOR_VAD);
         } else
 #endif
         {
           this->set_state_(State::IDLE, State::IDLE);
         }
       } else if (this->state_ == State::AWAITING_RESPONSE) {
         // No TTS start event ("nevermind")
         this->set_state_(State::IDLE, State::IDLE);
       }
       this->defer([this]() { this->end_trigger_->trigger(); });
       break;
     }
     case api::enums::VOICE_ASSISTANT_ERROR: {
       std::string code = "";
       std::string message = "";
       for (const auto &arg : msg.data) {
         if (arg.name == "code") {
           code = arg.value;
         } else if (arg.name == "message") {
           message = arg.value;
         }
       }
       if (code == "wake-word-timeout" || code == "wake_word_detection_aborted") {
         // Don't change state here since either the "tts-end" or "run-end" events will do it.
         return;
       } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
         // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
         this->defer([this, code, message]() {
           this->request_stop();
           this->error_trigger_->trigger(code, message);
         });
         return;
       }
       ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
       if (this->state_ != State::IDLE) {
         this->signal_stop_();
         this->set_state_(State::STOP_MICROPHONE, State::IDLE);
       }
       this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
       break;
     }
     case api::enums::VOICE_ASSISTANT_TTS_STREAM_START: {
 #ifdef USE_SPEAKER
       this->wait_for_stream_end_ = true;
       ESP_LOGD(TAG, "TTS stream start");
       this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
 #endif
       break;
     }
     case api::enums::VOICE_ASSISTANT_TTS_STREAM_END: {
 #ifdef USE_SPEAKER
       this->stream_ended_ = true;
       ESP_LOGD(TAG, "TTS stream end");
 #endif
       break;
     }
     case api::enums::VOICE_ASSISTANT_STT_VAD_START:
       ESP_LOGD(TAG, "Starting STT by VAD");
       this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
       break;
     case api::enums::VOICE_ASSISTANT_STT_VAD_END:
       ESP_LOGD(TAG, "STT by VAD end");
       this->set_state_(State::STOP_MICROPHONE, State::AWAITING_RESPONSE);
       this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
       break;
     default:
       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type);
       break;
   }
 }

 // Append a chunk of response audio received through the API connection to the speaker staging
 // buffer. The chunk is dropped, with a log message, if it does not fit or if no speaker buffer
 // exists. Without USE_SPEAKER this function does nothing.
 void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
 #ifdef USE_SPEAKER  // We should never get to this function if there is no speaker anyway
   // setup() only allocates speaker_buffer_ when a speaker is configured, so never write through
   // it unchecked.
   if (this->speaker_ == nullptr || this->speaker_buffer_ == nullptr) {
     ESP_LOGW(TAG, "Cannot receive audio, no speaker buffer available");
     return;
   }

   const size_t length = msg.data.length();
   if (this->speaker_buffer_index_ + length < SPEAKER_BUFFER_SIZE) {
     memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), length);
     this->speaker_buffer_index_ += length;
     this->speaker_buffer_size_ += length;
     this->speaker_bytes_received_ += length;
   } else {
     ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
   }
 #endif
 }

 // Global pointer to the voice assistant instance; null until setup() assigns `this` to it.
 VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)

 }  // namespace voice_assistant
 }  // namespace esphome

 #endif  // USE_VOICE_ASSISTANT
esphome::voice_assistant::VoiceAssistant::tts_stream_start_trigger_
Trigger * tts_stream_start_trigger_
Definition: voice_assistant.h:170

esphome::api::VoiceAssistantAudioSettings
Definition: api_pb2.h:1691

esphome::speaker::Speaker::play
virtual size_t play(const uint8_t *data, size_t length)=0

esphome::api::VoiceAssistantRequest::wake_word_phrase
std::string wake_word_phrase
Definition: api_pb2.h:1711

esphome::voice_assistant::State::RESPONSE_FINISHED

esphome::speaker::Speaker::is_running
bool is_running() const
Definition: speaker.h:23

esphome::voice_assistant::VoiceAssistant::wait_for_stream_end_
bool wait_for_stream_end_
Definition: voice_assistant.h:193

esphome::voice_assistant::VoiceAssistant::local_output_
bool local_output_
Definition: voice_assistant.h:200

esphome::setup_priority::AFTER_CONNECTION
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27

esphome::voice_assistant::VoiceAssistant::audio_mode_
AudioMode audio_mode_
Definition: voice_assistant.h:229

esphome::api::VoiceAssistantEventResponse
Definition: api_pb2.h:1745

esphome::Component::cancel_timeout
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition: component.cpp:73

esphome::voice_assistant::VoiceAssistant::intent_end_trigger_
Trigger * intent_end_trigger_
Definition: voice_assistant.h:162

esphome::voice_assistant::State::AWAITING_RESPONSE

esphome::voice_assistant::VoiceAssistant::high_freq_
HighFrequencyLoopRequester high_freq_
Definition: voice_assistant.h:206

esphome::voice_assistant::global_voice_assistant
VoiceAssistant * global_voice_assistant
Definition: voice_assistant.cpp:744

esphome::voice_assistant::VoiceAssistant::conversation_id_
std::string conversation_id_
Definition: voice_assistant.h:202

esphome::socket::set_sockaddr_any
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition: socket.cpp:53

esphome::voice_assistant::VoiceAssistant::wake_word_
std::string wake_word_
Definition: voice_assistant.h:204

esphome::api::VoiceAssistantRequest::conversation_id
std::string conversation_id
Definition: api_pb2.h:1708

esphome::voice_assistant::VoiceAssistant::api_client_
api::APIConnection * api_client_
Definition: voice_assistant.h:183

esphome::voice_assistant::VoiceAssistant::desired_state_
State desired_state_
Definition: voice_assistant.h:227

esphome::voice_assistant::VoiceAssistant::send_buffer_
uint8_t * send_buffer_
Definition: voice_assistant.h:220

esphome::voice_assistant::VoiceAssistant::get_setup_priority
float get_setup_priority() const override
Definition: voice_assistant.cpp:25

esphome::voice_assistant::VoiceAssistant::vad_instance_
vad_handle_t vad_instance_
Definition: voice_assistant.h:209

esphome::api::enums::VOICE_ASSISTANT_INTENT_START
Definition: api_pb2.h:183

esphome::voice_assistant::State::STREAMING_RESPONSE

esphome::api::VoiceAssistantAudioSettings::auto_gain
uint32_t auto_gain
Definition: api_pb2.h:1694

esphome::voice_assistant::VoiceAssistant::socket_
std::unique_ptr< socket::Socket > socket_
Definition: voice_assistant.h:159

esphome::api::VoiceAssistantRequest::flags
uint32_t flags
Definition: api_pb2.h:1709

esphome::api::VoiceAssistantAudioSettings::noise_suppression_level
uint32_t noise_suppression_level
Definition: api_pb2.h:1693

esphome::voice_assistant::State::WAIT_FOR_VAD

sockaddr_storage::ss_family
sa_family_t ss_family
Definition: headers.h:92

esphome::voice_assistant::VoiceAssistant::tts_start_trigger_
Trigger< std::string > * tts_start_trigger_
Definition: voice_assistant.h:176

esphome::Component::set_timeout
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition: component.cpp:69

esphome::api::enums::VOICE_ASSISTANT_RUN_START
Definition: api_pb2.h:179

esphome::ExternalRAMAllocator
An STL allocator that uses SPI RAM.
Definition: helpers.h:645

esphome::voice_assistant::State
State
Definition: voice_assistant.h:41

esphome::Component::defer
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition: component.cpp:130

esphome::voice_assistant::State::STOPPING_MICROPHONE

sockaddr_in
Definition: headers.h:61

esphome::voice_assistant::VoiceAssistant::tts_end_trigger_
Trigger< std::string > * tts_end_trigger_
Definition: voice_assistant.h:175

socklen_t
uint32_t socklen_t
Definition: headers.h:97

esphome::api::VoiceAssistantRequest::audio_settings
VoiceAssistantAudioSettings audio_settings
Definition: api_pb2.h:1710

esphome::voice_assistant::VoiceAssistant::mic_
microphone::Microphone * mic_
Definition: voice_assistant.h:185

esphome::api::VoiceAssistantEventResponse::event_type
enums::VoiceAssistantEvent event_type
Definition: api_pb2.h:1747

esphome::voice_assistant::AUDIO_MODE_API
Definition: voice_assistant.h:59

esphome::api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD
Definition: api_pb2.h:174

esphome::voice_assistant::VoiceAssistant::client_subscription
void client_subscription(api::APIConnection *client, bool subscribe)
Definition: voice_assistant.cpp:380

esphome::speaker::Speaker::has_buffered_data
virtual bool has_buffered_data() const =0

esphome::api::VoiceAssistantEventResponse::data
std::vector< VoiceAssistantEventData > data
Definition: api_pb2.h:1748

esphome::api::APIConnection::get_client_combined_info
std::string get_client_combined_info() const
Definition: api_connection.h:200

esphome::voice_assistant::VoiceAssistant::speaker_bytes_received_
size_t speaker_bytes_received_
Definition: voice_assistant.h:192

esphome::media_player::MediaPlayer::state
MediaPlayerState state
Definition: media_player.h:71

esphome::voice_assistant::VoiceAssistant::udp_socket_running_
bool udp_socket_running_
Definition: voice_assistant.h:230

esphome::microphone::Microphone::is_running
bool is_running() const
Definition: microphone.h:25

esphome::voice_assistant::VoiceAssistant::speaker_buffer_
uint8_t * speaker_buffer_
Definition: voice_assistant.h:189

esphome::voice_assistant::State::STREAMING_MICROPHONE

esphome::voice_assistant::VoiceAssistant::noise_suppression_level_
uint8_t noise_suppression_level_
Definition: voice_assistant.h:216

esphome::voice_assistant::VoiceAssistant::use_wake_word_
bool use_wake_word_
Definition: voice_assistant.h:215

esphome::Trigger::trigger
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95

esphome::voice_assistant::VoiceAssistant
Definition: voice_assistant.h:62

esphome::voice_assistant::VoiceAssistant::start_trigger_
Trigger * start_trigger_
Definition: voice_assistant.h:166

esphome::api::enums::VOICE_ASSISTANT_STT_VAD_START
Definition: api_pb2.h:189

esphome::voice_assistant::State::WAITING_FOR_VAD

esphome::voice_assistant::VoiceAssistant::auto_gain_
uint8_t auto_gain_
Definition: voice_assistant.h:217

esphome::voice_assistant::VoiceAssistant::listening_trigger_
Trigger * listening_trigger_
Definition: voice_assistant.h:164

esphome::api::enums::VOICE_ASSISTANT_STT_START
Definition: api_pb2.h:181

esphome::voice_assistant::VoiceAssistant::write_speaker_
void write_speaker_()
Definition: voice_assistant.cpp:365

esphome::api::enums::VOICE_ASSISTANT_TTS_START
Definition: api_pb2.h:185

esphome::voice_assistant::VoiceAssistant::media_player_
media_player::MediaPlayer * media_player_
Definition: voice_assistant.h:197

esphome::voice_assistant::VoiceAssistant::input_buffer_
int16_t * input_buffer_
Definition: voice_assistant.h:221

esphome::HighFrequencyLoopRequester::start
void start()
Start running the loop continuously.
Definition: helpers.cpp:547

esphome::voice_assistant::VoiceAssistant::client_connected_trigger_
Trigger * client_connected_trigger_
Definition: voice_assistant.h:180

voice_assistant.h

esphome::voice_assistant::AUDIO_MODE_UDP
Definition: voice_assistant.h:58

esphome::api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition: api_pb2.h:175

esphome::voice_assistant::VoiceAssistant::start_udp_socket_
bool start_udp_socket_()
Definition: voice_assistant.cpp:27

esphome::voice_assistant::State::STOP_MICROPHONE

sockaddr
Definition: headers.h:83

esphome::voice_assistant::VoiceAssistant::end_trigger_
Trigger * end_trigger_
Definition: voice_assistant.h:165

esphome::microphone::Microphone::start
virtual void start()=0

esphome::api::APIServerConnectionBase::send_voice_assistant_request
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
Definition: api_pb2_service.cpp:468

esphome::api::enums::VOICE_ASSISTANT_INTENT_END
Definition: api_pb2.h:184

esphome::api::enums::VOICE_ASSISTANT_WAKE_WORD_START
Definition: api_pb2.h:187

esphome::voice_assistant::VoiceAssistant::speaker_
speaker::Speaker * speaker_
Definition: voice_assistant.h:188

esphome::api::enums::MEDIA_PLAYER_STATE_PLAYING
Definition: api_pb2.h:149

esphome::voice_assistant::VoiceAssistant::failed_to_start
void failed_to_start()
Definition: voice_assistant.cpp:448

esphome::HighFrequencyLoopRequester::stop
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:553

sockaddr_storage
Definition: headers.h:90

esphome::media_player::MediaPlayer::make_call
MediaPlayerCall make_call()
Definition: media_player.h:74

esphome::voice_assistant::VoiceAssistant::set_state_
void set_state_(State state)
Definition: voice_assistant.cpp:435

esphome::voice_assistant::VoiceAssistant::continuous_
bool continuous_
Definition: voice_assistant.h:223

esphome::voice_assistant::State::START_PIPELINE

esphome::voice_assistant::VoiceAssistant::read_microphone_
int read_microphone_()
Definition: voice_assistant.cpp:115

esphome::microphone::Microphone::stop
virtual void stop()=0

esphome::voice_assistant::VoiceAssistant::state_
State state_
Definition: voice_assistant.h:226

esphome::voice_assistant::VoiceAssistant::stt_vad_start_trigger_
Trigger * stt_vad_start_trigger_
Definition: voice_assistant.h:167

esphome::shelly_dimmer::flags
const uint32_t flags
Definition: stm32flash.h:85

esphome::voice_assistant::VoiceAssistant::volume_multiplier_
float volume_multiplier_
Definition: voice_assistant.h:218

esphome::voice_assistant::VoiceAssistant::ring_buffer_
std::unique_ptr< RingBuffer > ring_buffer_
Definition: voice_assistant.h:213

esphome::api::enums::VOICE_ASSISTANT_RUN_END
Definition: api_pb2.h:180

esphome::api::APIServerConnectionBase::send_voice_assistant_audio
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
Definition: api_pb2_service.cpp:480

esphome::microphone::Microphone::is_stopped
bool is_stopped() const
Definition: microphone.h:26

esphome::api::enums::VOICE_ASSISTANT_TTS_END
Definition: api_pb2.h:186

esphome::voice_assistant::VoiceAssistant::on_audio
void on_audio(const api::VoiceAssistantAudio &msg)
Definition: voice_assistant.cpp:731

esphome::voice_assistant::VoiceAssistant::start_streaming
void start_streaming()
Definition: voice_assistant.cpp:454

esphome::voice_assistant::VoiceAssistant::client_disconnected_trigger_
Trigger * client_disconnected_trigger_
Definition: voice_assistant.h:181

esphome::voice_assistant::VoiceAssistant::stt_vad_end_trigger_
Trigger * stt_vad_end_trigger_
Definition: voice_assistant.h:168

esphome::voice_assistant::VoiceAssistant::dest_addr_
struct sockaddr_storage dest_addr_
Definition: voice_assistant.h:160

esphome::voice_assistant::VoiceAssistant::stream_ended_
bool stream_ended_
Definition: voice_assistant.h:194

esphome::microphone::Microphone::read
virtual size_t read(int16_t *buf, size_t len)=0

esphome::voice_assistant::VoiceAssistant::wake_word_detected_trigger_
Trigger * wake_word_detected_trigger_
Definition: voice_assistant.h:173

esphome::media_player::MediaPlayerCall::perform
void perform()
Definition: media_player.cpp:58

esphome::voice_assistant::VoiceAssistant::loop
void loop() override
Definition: voice_assistant.cpp:131

esphome::ExternalRAMAllocator::allocate
T * allocate(size_t n)
Definition: helpers.h:659

esphome::voice_assistant::VoiceAssistant::tts_stream_end_trigger_
Trigger * tts_stream_end_trigger_
Definition: voice_assistant.h:171

esphome::voice_assistant::VoiceAssistant::silence_detection_
bool silence_detection_
Definition: voice_assistant.h:224

esphome::speaker::Speaker::start
virtual void start()=0

esphome::api::VoiceAssistantRequest::start
bool start
Definition: api_pb2.h:1707

esphome::Component::mark_failed
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118

esphome::api::VoiceAssistantRequest
Definition: api_pb2.h:1705

esphome::voice_assistant::VoiceAssistant::request_stop
void request_stop()
Definition: voice_assistant.cpp:522

esphome::voice_assistant::VoiceAssistant::idle_trigger_
Trigger * idle_trigger_
Definition: voice_assistant.h:178

esphome
This is a workaround until we can figure out a way to get the tflite-micro idf component code availab...
Definition: a01nyub.cpp:7

sockaddr_in6
Definition: headers.h:72

esphome::api::enums::VOICE_ASSISTANT_TTS_STREAM_START
Definition: api_pb2.h:191

esphome::voice_assistant::VoiceAssistant::vad_threshold_
uint8_t vad_threshold_
Definition: voice_assistant.h:210

esphome::voice_assistant::VoiceAssistant::speaker_buffer_size_
size_t speaker_buffer_size_
Definition: voice_assistant.h:191

esphome::voice_assistant::VoiceAssistant::intent_start_trigger_
Trigger * intent_start_trigger_
Definition: voice_assistant.h:163

esphome::api::enums::VOICE_ASSISTANT_TTS_STREAM_END
Definition: api_pb2.h:192

log.h

esphome::api::enums::VOICE_ASSISTANT_STT_VAD_END
Definition: api_pb2.h:190

esphome::voice_assistant::State::STARTING_PIPELINE

esphome::voice_assistant::State::STARTING_MICROPHONE

esphome::voice_assistant::State::IDLE

esphome::RingBuffer::create
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:14

esphome::media_player::MediaPlayerCall::set_media_url
MediaPlayerCall & set_media_url(const std::string &url)
Definition: media_player.cpp:101

esphome::voice_assistant::State::START_MICROPHONE

esphome::api::VoiceAssistantAudioSettings::volume_multiplier
float volume_multiplier
Definition: api_pb2.h:1695

esphome::voice_assistant::VoiceAssistant::vad_counter_
uint8_t vad_counter_
Definition: voice_assistant.h:211

esphome::api::VoiceAssistantAudio::data
std::string data
Definition: api_pb2.h:1760

esphome::api::enums::VOICE_ASSISTANT_STT_END
Definition: api_pb2.h:182

esphome::voice_assistant::VoiceAssistant::speaker_buffer_index_
size_t speaker_buffer_index_
Definition: voice_assistant.h:190

esphome::speaker::Speaker::stop
virtual void stop()=0

esphome::api::APIConnection
Definition: api_connection.h:16

esphome::voice_assistant::VoiceAssistant::setup
void setup() override
Definition: voice_assistant.cpp:70

esphome::api::VoiceAssistantAudio
Definition: api_pb2.h:1758

esphome::api::enums::VOICE_ASSISTANT_ERROR
Definition: api_pb2.h:178

esphome::voice_assistant::VoiceAssistant::on_event
void on_event(const api::VoiceAssistantEventResponse &msg)
Definition: voice_assistant.cpp:562

state
bool state
Definition: fan.h:34

esphome::api::enums::VOICE_ASSISTANT_WAKE_WORD_END
Definition: api_pb2.h:188

esphome::voice_assistant::VoiceAssistant::stt_end_trigger_
Trigger< std::string > * stt_end_trigger_
Definition: voice_assistant.h:174

esphome::voice_assistant::VoiceAssistant::request_start
void request_start(bool continuous, bool silence_detection)
Definition: voice_assistant.cpp:500

esphome::voice_assistant::VoiceAssistant::error_trigger_
Trigger< std::string, std::string > * error_trigger_
Definition: voice_assistant.h:177

esphome::socket::socket
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
Definition: bsd_sockets_impl.cpp:133

esphome::voice_assistant::VoiceAssistant::signal_stop_
void signal_stop_()
Definition: voice_assistant.cpp:551