ESPHome  2024.6.6
voice_assistant.cpp
Go to the documentation of this file.
1 #include "voice_assistant.h"
2 
3 #ifdef USE_VOICE_ASSISTANT
4 
5 #include "esphome/core/log.h"
6 
7 #include <cinttypes>
8 #include <cstdio>
9 
10 namespace esphome {
11 namespace voice_assistant {
12 
13 static const char *const TAG = "voice_assistant";
14 
15 #ifdef SAMPLE_RATE_HZ
16 #undef SAMPLE_RATE_HZ
17 #endif
18 
19 static const size_t SAMPLE_RATE_HZ = 16000;
20 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
21 static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
22 static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
23 static const size_t RECEIVE_SIZE = 1024;
24 static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
25 
27 
29  this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
30  if (this->socket_ == nullptr) {
31  ESP_LOGE(TAG, "Could not create socket");
32  this->mark_failed();
33  return false;
34  }
35  int enable = 1;
36  int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
37  if (err != 0) {
38  ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
39  // we can still continue
40  }
41  err = this->socket_->setblocking(false);
42  if (err != 0) {
43  ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
44  this->mark_failed();
45  return false;
46  }
47 
48 #ifdef USE_SPEAKER
49  if (this->speaker_ != nullptr) {
50  struct sockaddr_storage server;
51 
52  socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
53  if (sl == 0) {
54  ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
55  this->mark_failed();
56  return false;
57  }
58 
59  err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
60  if (err != 0) {
61  ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
62  this->mark_failed();
63  return false;
64  }
65  }
66 #endif
67  this->udp_socket_running_ = true;
68  return true;
69 }
70 
72  ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
73 
75 }
76 
78  if (this->send_buffer_ != nullptr) {
79  return true; // Already allocated
80  }
81 
82 #ifdef USE_SPEAKER
83  if (this->speaker_ != nullptr) {
85  this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
86  if (this->speaker_buffer_ == nullptr) {
87  ESP_LOGW(TAG, "Could not allocate speaker buffer");
88  return false;
89  }
90  }
91 #endif
92 
94  this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
95  if (this->input_buffer_ == nullptr) {
96  ESP_LOGW(TAG, "Could not allocate input buffer");
97  return false;
98  }
99 
100 #ifdef USE_ESP_ADF
101  this->vad_instance_ = vad_create(VAD_MODE_4);
102 #endif
103 
104  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
105  if (this->ring_buffer_ == nullptr) {
106  ESP_LOGW(TAG, "Could not allocate ring buffer");
107  return false;
108  }
109 
111  this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
112  if (send_buffer_ == nullptr) {
113  ESP_LOGW(TAG, "Could not allocate send buffer");
114  return false;
115  }
116 
117  return true;
118 }
119 
121  if (this->send_buffer_ != nullptr) {
122  memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
123  }
124 
125  if (this->input_buffer_ != nullptr) {
126  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
127  }
128 
129  if (this->ring_buffer_ != nullptr) {
130  this->ring_buffer_->reset();
131  }
132 
133 #ifdef USE_SPEAKER
134  if (this->speaker_buffer_ != nullptr) {
135  memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
136 
137  this->speaker_buffer_size_ = 0;
138  this->speaker_buffer_index_ = 0;
139  this->speaker_bytes_received_ = 0;
140  }
141 #endif
142 }
143 
146  send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
147  this->send_buffer_ = nullptr;
148 
149  if (this->ring_buffer_ != nullptr) {
150  this->ring_buffer_.reset();
151  this->ring_buffer_ = nullptr;
152  }
153 
154 #ifdef USE_ESP_ADF
155  if (this->vad_instance_ != nullptr) {
156  vad_destroy(this->vad_instance_);
157  this->vad_instance_ = nullptr;
158  }
159 #endif
160 
162  input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
163  this->input_buffer_ = nullptr;
164 
165 #ifdef USE_SPEAKER
166  if (this->speaker_buffer_ != nullptr) {
168  speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
169  this->speaker_buffer_ = nullptr;
170  }
171 #endif
172 }
173 
175  size_t bytes_read = 0;
176  if (this->mic_->is_running()) { // Read audio into input buffer
177  bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
178  if (bytes_read == 0) {
179  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
180  return 0;
181  }
182  // Write audio into ring buffer
183  this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
184  } else {
185  ESP_LOGD(TAG, "microphone not running");
186  }
187  return bytes_read;
188 }
189 
191  if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
193  if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
195  } else {
197  }
198  this->continuous_ = false;
199  this->signal_stop_();
200  this->clear_buffers_();
201  return;
202  }
203  switch (this->state_) {
204  case State::IDLE: {
205  if (this->continuous_ && this->desired_state_ == State::IDLE) {
206  this->idle_trigger_->trigger();
207 #ifdef USE_ESP_ADF
208  if (this->use_wake_word_) {
210  } else
211 #endif
212  {
214  }
215  } else {
216  this->high_freq_.stop();
217  }
218  break;
219  }
221  ESP_LOGD(TAG, "Starting Microphone");
222  if (!this->allocate_buffers_()) {
223  this->status_set_error("Failed to allocate buffers");
224  return;
225  }
226  if (this->status_has_error()) {
227  this->status_clear_error();
228  }
229  this->clear_buffers_();
230 
231  this->mic_->start();
232  this->high_freq_.start();
234  break;
235  }
237  if (this->mic_->is_running()) {
238  this->set_state_(this->desired_state_);
239  }
240  break;
241  }
242 #ifdef USE_ESP_ADF
243  case State::WAIT_FOR_VAD: {
244  this->read_microphone_();
245  ESP_LOGD(TAG, "Waiting for speech...");
247  break;
248  }
249  case State::WAITING_FOR_VAD: {
250  size_t bytes_read = this->read_microphone_();
251  if (bytes_read > 0) {
252  vad_state_t vad_state =
253  vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
254  if (vad_state == VAD_SPEECH) {
255  if (this->vad_counter_ < this->vad_threshold_) {
256  this->vad_counter_++;
257  } else {
258  ESP_LOGD(TAG, "VAD detected speech");
260 
261  // Reset for next time
262  this->vad_counter_ = 0;
263  }
264  } else {
265  if (this->vad_counter_ > 0) {
266  this->vad_counter_--;
267  }
268  }
269  }
270  break;
271  }
272 #endif
273  case State::START_PIPELINE: {
274  this->read_microphone_();
275  ESP_LOGD(TAG, "Requesting start...");
276  uint32_t flags = 0;
277  if (this->use_wake_word_)
279  if (this->silence_detection_)
281  api::VoiceAssistantAudioSettings audio_settings;
282  audio_settings.noise_suppression_level = this->noise_suppression_level_;
283  audio_settings.auto_gain = this->auto_gain_;
284  audio_settings.volume_multiplier = this->volume_multiplier_;
285 
287  msg.start = true;
288  msg.conversation_id = this->conversation_id_;
289  msg.flags = flags;
290  msg.audio_settings = audio_settings;
291  msg.wake_word_phrase = this->wake_word_;
292  this->wake_word_ = "";
293 
294  if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) {
295  ESP_LOGW(TAG, "Could not request start");
296  this->error_trigger_->trigger("not-connected", "Could not request start");
297  this->continuous_ = false;
299  break;
300  }
302  this->set_timeout("reset-conversation_id", 5 * 60 * 1000, [this]() { this->conversation_id_ = ""; });
303  break;
304  }
306  this->read_microphone_();
307  break; // State changed when udp server port received
308  }
310  this->read_microphone_();
311  size_t available = this->ring_buffer_->available();
312  while (available >= SEND_BUFFER_SIZE) {
313  size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
314  if (this->audio_mode_ == AUDIO_MODE_API) {
316  msg.data.assign((char *) this->send_buffer_, read_bytes);
318  } else {
319  if (!this->udp_socket_running_) {
320  if (!this->start_udp_socket_()) {
322  break;
323  }
324  }
325  this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
326  sizeof(this->dest_addr_));
327  }
328  available = this->ring_buffer_->available();
329  }
330 
331  break;
332  }
333  case State::STOP_MICROPHONE: {
334  if (this->mic_->is_running()) {
335  this->mic_->stop();
337  } else {
338  this->set_state_(this->desired_state_);
339  }
340  break;
341  }
343  if (this->mic_->is_stopped()) {
344  this->set_state_(this->desired_state_);
345  }
346  break;
347  }
349  break; // State changed by events
350  }
352  bool playing = false;
353 #ifdef USE_SPEAKER
354  if (this->speaker_ != nullptr) {
355  ssize_t received_len = 0;
356  if (this->audio_mode_ == AUDIO_MODE_UDP) {
357  if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
358  received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
359  if (received_len > 0) {
360  this->speaker_buffer_index_ += received_len;
361  this->speaker_buffer_size_ += received_len;
362  this->speaker_bytes_received_ += received_len;
363  }
364  } else {
365  ESP_LOGD(TAG, "Receive buffer full");
366  }
367  }
368  // Build a small buffer of audio before sending to the speaker
369  bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
370  if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
371  this->write_speaker_();
372  if (this->wait_for_stream_end_) {
373  this->cancel_timeout("playing");
374  if (end_of_stream) {
375  ESP_LOGD(TAG, "End of audio stream received");
376  this->cancel_timeout("speaker-timeout");
378  }
379  break; // We dont want to timeout here as the STREAM_END event will take care of that.
380  }
381  playing = this->speaker_->is_running();
382  }
383 #endif
384 #ifdef USE_MEDIA_PLAYER
385  if (this->media_player_ != nullptr) {
387  }
388 #endif
389  if (playing) {
390  this->set_timeout("playing", 2000, [this]() {
391  this->cancel_timeout("speaker-timeout");
393  });
394  }
395  break;
396  }
398 #ifdef USE_SPEAKER
399  if (this->speaker_ != nullptr) {
400  if (this->speaker_buffer_size_ > 0) {
401  this->write_speaker_();
402  break;
403  }
404  if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
405  break;
406  }
407  ESP_LOGD(TAG, "Speaker has finished outputting all audio");
408  this->speaker_->stop();
409  this->cancel_timeout("speaker-timeout");
410  this->cancel_timeout("playing");
411 
412  this->clear_buffers_();
413 
414  this->wait_for_stream_end_ = false;
415  this->stream_ended_ = false;
416 
418  }
419 #endif
421  break;
422  }
423  default:
424  break;
425  }
426 }
427 
428 #ifdef USE_SPEAKER
430  if (this->speaker_buffer_size_ > 0) {
431  size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
432  size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
433  if (written > 0) {
434  memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
435  this->speaker_buffer_size_ -= written;
436  this->speaker_buffer_index_ -= written;
437  this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
438  } else {
439  ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
440  }
441  }
442 }
443 #endif
444 
446  if (!subscribe) {
447  if (this->api_client_ == nullptr || client != this->api_client_) {
448  ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
449  return;
450  }
451  this->api_client_ = nullptr;
453  return;
454  }
455 
456  if (this->api_client_ != nullptr) {
457  ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
458  ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
459  ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
460  return;
461  }
462 
463  this->api_client_ = client;
465 }
466 
467 static const LogString *voice_assistant_state_to_string(State state) {
468  switch (state) {
469  case State::IDLE:
470  return LOG_STR("IDLE");
472  return LOG_STR("START_MICROPHONE");
474  return LOG_STR("STARTING_MICROPHONE");
475  case State::WAIT_FOR_VAD:
476  return LOG_STR("WAIT_FOR_VAD");
478  return LOG_STR("WAITING_FOR_VAD");
480  return LOG_STR("START_PIPELINE");
482  return LOG_STR("STARTING_PIPELINE");
484  return LOG_STR("STREAMING_MICROPHONE");
486  return LOG_STR("STOP_MICROPHONE");
488  return LOG_STR("STOPPING_MICROPHONE");
490  return LOG_STR("AWAITING_RESPONSE");
492  return LOG_STR("STREAMING_RESPONSE");
494  return LOG_STR("RESPONSE_FINISHED");
495  default:
496  return LOG_STR("UNKNOWN");
497  }
498 };
499 
501  State old_state = this->state_;
502  this->state_ = state;
503  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
504  LOG_STR_ARG(voice_assistant_state_to_string(state)));
505 }
506 
507 void VoiceAssistant::set_state_(State state, State desired_state) {
508  this->set_state_(state);
509  this->desired_state_ = desired_state;
510  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
511 }
512 
514  ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
515  this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
517 }
518 
520  if (this->state_ != State::STARTING_PIPELINE) {
521  this->signal_stop_();
522  return;
523  }
524 
525  ESP_LOGD(TAG, "Client started, streaming microphone");
526  this->audio_mode_ = AUDIO_MODE_API;
527 
528  if (this->mic_->is_running()) {
530  } else {
532  }
533 }
534 
// Switches the assistant to UDP audio mode and records where microphone audio is sent.
//
// addr: destination socket address; copied into this->dest_addr_.
// port: destination port in host byte order; written into the copied address with htons().
//
// Calls signal_stop_() and returns without further changes unless the current state is
// STARTING_PIPELINE.
535 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
536  if (this->state_ != State::STARTING_PIPELINE) {
537  this->signal_stop_();
538  return;
539  }
540 
541  ESP_LOGD(TAG, "Client started, streaming microphone");
542  this->audio_mode_ = AUDIO_MODE_UDP;
543 
// Copy the whole address, then overwrite only the port field for the detected family.
544  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
545  if (this->dest_addr_.ss_family == AF_INET) {
546  ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
547  }
548 #if LWIP_IPV6
549  else if (this->dest_addr_.ss_family == AF_INET6) {
550  ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
551  }
552 #endif
553  else {
// Unsupported family: only a warning is logged; audio_mode_ and dest_addr_ keep the
// values assigned above and no stop is signalled on this path.
554  ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
555  return;
556  }
557 
// NOTE(review): both branches below are empty in this listing (listing lines 559 and 561
// are absent). Presumably each performed a state transition - restore them from the
// upstream source before building.
558  if (this->mic_->is_running()) {
560  } else {
562  }
563 }
564 
// Entry point for starting a voice assistant run.
//
// continuous: stored in this->continuous_ when the request is accepted.
// silence_detection: stored in this->silence_detection_ when the request is accepted.
//
// With no API client, an error is logged, continuous_ is cleared and the call returns.
// The request is otherwise acted on only while the state is IDLE; in any other state
// the call has no effect.
565 void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
566  if (this->api_client_ == nullptr) {
567  ESP_LOGE(TAG, "No API client connected");
// NOTE(review): listing line 568 is absent here - a statement is missing from this view.
569  this->continuous_ = false;
570  return;
571  }
572  if (this->state_ == State::IDLE) {
573  this->continuous_ = continuous;
574  this->silence_detection_ = silence_detection;
// With USE_ESP_ADF defined, the trailing `else` pairs with the braced block after #endif;
// without it, that block runs unconditionally.
// NOTE(review): both branch bodies are empty in this listing (listing lines 577 and 581
// are absent). Presumably they select the next state - restore from the upstream source.
575 #ifdef USE_ESP_ADF
576  if (this->use_wake_word_) {
578  } else
579 #endif
580  {
582  }
583  }
584 }
585 
587  this->continuous_ = false;
588 
589  switch (this->state_) {
590  case State::IDLE:
591  break;
594  case State::WAIT_FOR_VAD:
598  break;
601  this->signal_stop_();
603  break;
606  this->desired_state_ = State::IDLE;
607  break;
611  break; // Let the incoming audio stream finish then it will go to idle.
612  }
613 }
614 
616  memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
617  if (this->api_client_ == nullptr) {
618  return;
619  }
620  ESP_LOGD(TAG, "Signaling stop...");
622  msg.start = false;
624 }
625 
627  ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
628  switch (msg.event_type) {
630  ESP_LOGD(TAG, "Assist Pipeline running");
631  this->defer([this]() { this->start_trigger_->trigger(); });
632  break;
634  break;
636  ESP_LOGD(TAG, "Wake word detected");
637  this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
638  break;
639  }
641  ESP_LOGD(TAG, "STT started");
642  this->defer([this]() { this->listening_trigger_->trigger(); });
643  break;
645  std::string text;
646  for (auto arg : msg.data) {
647  if (arg.name == "text") {
648  text = std::move(arg.value);
649  }
650  }
651  if (text.empty()) {
652  ESP_LOGW(TAG, "No text in STT_END event");
653  return;
654  }
655  ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
656  this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
657  break;
658  }
660  ESP_LOGD(TAG, "Intent started");
661  this->defer([this]() { this->intent_start_trigger_->trigger(); });
662  break;
664  for (auto arg : msg.data) {
665  if (arg.name == "conversation_id") {
666  this->conversation_id_ = std::move(arg.value);
667  }
668  }
669  this->defer([this]() { this->intent_end_trigger_->trigger(); });
670  break;
671  }
673  std::string text;
674  for (auto arg : msg.data) {
675  if (arg.name == "text") {
676  text = std::move(arg.value);
677  }
678  }
679  if (text.empty()) {
680  ESP_LOGW(TAG, "No text in TTS_START event");
681  return;
682  }
683  ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
684  this->defer([this, text]() {
685  this->tts_start_trigger_->trigger(text);
686 #ifdef USE_SPEAKER
687  this->speaker_->start();
688 #endif
689  });
690  break;
691  }
693  std::string url;
694  for (auto arg : msg.data) {
695  if (arg.name == "url") {
696  url = std::move(arg.value);
697  }
698  }
699  if (url.empty()) {
700  ESP_LOGW(TAG, "No url in TTS_END event");
701  return;
702  }
703  ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
704  this->defer([this, url]() {
705 #ifdef USE_MEDIA_PLAYER
706  if (this->media_player_ != nullptr) {
708  }
709 #endif
710  this->tts_end_trigger_->trigger(url);
711  });
713  this->set_state_(new_state, new_state);
714  break;
715  }
717  ESP_LOGD(TAG, "Assist Pipeline ended");
718  if (this->state_ == State::STREAMING_MICROPHONE) {
719  this->ring_buffer_->reset();
720 #ifdef USE_ESP_ADF
721  if (this->use_wake_word_) {
722  // No need to stop the microphone since we didn't use the speaker
724  } else
725 #endif
726  {
728  }
729  } else if (this->state_ == State::AWAITING_RESPONSE) {
730  // No TTS start event ("nevermind")
732  }
733  this->defer([this]() { this->end_trigger_->trigger(); });
734  break;
735  }
737  std::string code = "";
738  std::string message = "";
739  for (auto arg : msg.data) {
740  if (arg.name == "code") {
741  code = std::move(arg.value);
742  } else if (arg.name == "message") {
743  message = std::move(arg.value);
744  }
745  }
746  if (code == "wake-word-timeout" || code == "wake_word_detection_aborted") {
747  // Don't change state here since either the "tts-end" or "run-end" events will do it.
748  return;
749  } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
750  // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
751  this->defer([this, code, message]() {
752  this->request_stop();
753  this->error_trigger_->trigger(code, message);
754  });
755  return;
756  }
757  ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
758  if (this->state_ != State::IDLE) {
759  this->signal_stop_();
761  }
762  this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
763  break;
764  }
766 #ifdef USE_SPEAKER
767  this->wait_for_stream_end_ = true;
768  ESP_LOGD(TAG, "TTS stream start");
769  this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
770 #endif
771  break;
772  }
774 #ifdef USE_SPEAKER
775  this->stream_ended_ = true;
776  ESP_LOGD(TAG, "TTS stream end");
777 #endif
778  break;
779  }
781  ESP_LOGD(TAG, "Starting STT by VAD");
782  this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
783  break;
785  ESP_LOGD(TAG, "STT by VAD end");
787  this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
788  break;
789  default:
790  ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
791  break;
792  }
793 }
794 
796 #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
797  if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
798  memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
799  this->speaker_buffer_index_ += msg.data.length();
800  this->speaker_buffer_size_ += msg.data.length();
801  this->speaker_bytes_received_ += msg.data.length();
802  ESP_LOGV(TAG, "Received audio: %" PRId32 " bytes from API", msg.data.length());
803  } else {
804  ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
805  }
806 #endif
807 }
808 
810  Timer timer = {
811  .id = msg.timer_id,
812  .name = msg.name,
813  .total_seconds = msg.total_seconds,
814  .seconds_left = msg.seconds_left,
815  .is_active = msg.is_active,
816  };
817  this->timers_[timer.id] = timer;
818  ESP_LOGD(TAG, "Timer Event");
819  ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
820  ESP_LOGD(TAG, " %s", timer.to_string().c_str());
821 
822  switch (msg.event_type) {
824  this->timer_started_trigger_->trigger(timer);
825  break;
827  this->timer_updated_trigger_->trigger(timer);
828  break;
830  this->timer_cancelled_trigger_->trigger(timer);
831  this->timers_.erase(timer.id);
832  break;
834  this->timer_finished_trigger_->trigger(timer);
835  this->timers_.erase(timer.id);
836  break;
837  }
838 
839  if (this->timers_.empty()) {
840  this->cancel_interval("timer-event");
841  this->timer_tick_running_ = false;
842  } else if (!this->timer_tick_running_) {
843  this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
844  this->timer_tick_running_ = true;
845  }
846 }
847 
849  std::vector<Timer> res;
850  res.reserve(this->timers_.size());
851  for (auto &pair : this->timers_) {
852  auto &timer = pair.second;
853  if (timer.is_active && timer.seconds_left > 0) {
854  timer.seconds_left--;
855  }
856  res.push_back(timer);
857  }
858  this->timer_tick_trigger_->trigger(res);
859 }
860 
861 VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
862 
863 } // namespace voice_assistant
864 } // namespace esphome
865 
866 #endif // USE_VOICE_ASSISTANT
virtual size_t play(const uint8_t *data, size_t length)=0
bool is_running() const
Definition: speaker.h:23
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition: component.cpp:52
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27
std::unordered_map< std::string, Timer > timers_
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition: component.cpp:73
enums::VoiceAssistantTimerEvent event_type
Definition: api_pb2.h:1786
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition: socket.cpp:53
std::unique_ptr< socket::Socket > socket_
sa_family_t ss_family
Definition: headers.h:92
Trigger< std::string > * tts_start_trigger_
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition: component.cpp:69
bool cancel_interval(const std::string &name)
Cancel an interval function.
Definition: component.cpp:56
An STL allocator that uses SPI RAM.
Definition: helpers.h:651
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition: component.cpp:130
void deallocate(T *p, size_t n)
Definition: helpers.h:678
Trigger< std::string > * tts_end_trigger_
uint32_t socklen_t
Definition: headers.h:97
VoiceAssistantAudioSettings audio_settings
Definition: api_pb2.h:1723
Trigger< std::vector< Timer > > * timer_tick_trigger_
enums::VoiceAssistantEvent event_type
Definition: api_pb2.h:1760
void client_subscription(api::APIConnection *client, bool subscribe)
virtual bool has_buffered_data() const =0
std::vector< VoiceAssistantEventData > data
Definition: api_pb2.h:1761
std::string get_client_combined_info() const
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95
bool status_has_error() const
Definition: component.cpp:150
void status_set_error(const char *message="unspecified")
Definition: component.cpp:159
media_player::MediaPlayer * media_player_
void start()
Start running the loop continuously.
Definition: helpers.cpp:648
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:654
const uint32_t flags
Definition: stm32flash.h:85
std::unique_ptr< RingBuffer > ring_buffer_
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
void status_clear_error()
Definition: component.cpp:172
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
MediaPlayerCall & set_announcement(bool announce)
virtual size_t read(int16_t *buf, size_t len)=0
virtual void start()=0
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118
This is a workaround until we can figure out a way to get the tflite-micro idf component code availab...
Definition: a01nyub.cpp:7
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:14
MediaPlayerCall & set_media_url(const std::string &url)
virtual void stop()=0
void on_event(const api::VoiceAssistantEventResponse &msg)
bool state
Definition: fan.h:34
Trigger< std::string > * stt_end_trigger_
void request_start(bool continuous, bool silence_detection)
Trigger< std::string, std::string > * error_trigger_
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.