@@ -88,7 +88,12 @@ async def _add_error(self, error: Exception):
8888 def _transform_audio_buffer (
8989 self , buffer : list [bytes ], output_dtype : npt .DTypeLike
9090 ) -> npt .NDArray [np .int16 | np .float32 ]:
91- np_array = np .frombuffer (b"" .join (buffer ), dtype = np .int16 )
91+ combined_buffer = b"" .join (buffer )
92+ if len (combined_buffer ) % 2 != 0 :
93+ # np.frombuffer requires the byte length to be a multiple of the int16 item size (2); zero-pad odd-length input.
94+ combined_buffer += b"\x00 "
95+
96+ np_array = np .frombuffer (combined_buffer , dtype = np .int16 )
9297
9398 if output_dtype == np .int16 :
9499 return np_array
@@ -118,6 +123,7 @@ async def _stream_audio(
118123 first_byte_received = False
119124 buffer : list [bytes ] = []
120125 full_audio_data : list [bytes ] = []
126+ pending_byte = b""
121127
122128 async for chunk in self .tts_model .run (text , self .tts_settings ):
123129 if not first_byte_received :
@@ -128,15 +134,33 @@ async def _stream_audio(
128134 buffer .append (chunk )
129135 full_audio_data .append (chunk )
130136 if len (buffer ) >= self ._buffer_size :
131- audio_np = self ._transform_audio_buffer (buffer , self .tts_settings .dtype )
132- if self .tts_settings .transform_data :
133- audio_np = self .tts_settings .transform_data (audio_np )
134- await local_queue .put (
135- VoiceStreamEventAudio (data = audio_np )
136- ) # Use local queue
137+ combined = pending_byte + b"" .join (buffer )
138+ if len (combined ) % 2 != 0 :
139+ pending_byte = combined [- 1 :]
140+ combined = combined [:- 1 ]
141+ else :
142+ pending_byte = b""
143+
144+ if combined :
145+ audio_np = self ._transform_audio_buffer (
146+ [combined ], self .tts_settings .dtype
147+ )
148+ if self .tts_settings .transform_data :
149+ audio_np = self .tts_settings .transform_data (audio_np )
150+ await local_queue .put (
151+ VoiceStreamEventAudio (data = audio_np )
152+ ) # Use local queue
137153 buffer = []
138154 if buffer :
139- audio_np = self ._transform_audio_buffer (buffer , self .tts_settings .dtype )
155+ combined = pending_byte + b"" .join (buffer )
156+ else :
157+ combined = pending_byte
158+
159+ if combined :
160+ # Final flush: pad the remaining half sample if needed.
161+ if len (combined ) % 2 != 0 :
162+ combined += b"\x00 "
163+ audio_np = self ._transform_audio_buffer ([combined ], self .tts_settings .dtype )
140164 if self .tts_settings .transform_data :
141165 audio_np = self .tts_settings .transform_data (audio_np )
142166 await local_queue .put (VoiceStreamEventAudio (data = audio_np )) # Use local queue
0 commit comments