@@ -408,6 +408,11 @@ def add_frames(self, frame_np):
408
408
if self .frames_np is not None and self .frames_np .shape [0 ] > 45 * self .RATE :
409
409
self .frames_offset += 30.0
410
410
self .frames_np = self .frames_np [int (30 * self .RATE ):]
411
+ # check timestamp offset (should be >= self.frames_offset)
412
+ # this basically means that there is no speech, as the timestamp offset hasn't been updated
413
+ # and is less than frames_offset
414
+ if self .timestamp_offset < self .frames_offset :
415
+ self .timestamp_offset = self .frames_offset
411
416
if self .frames_np is None :
412
417
self .frames_np = frame_np .copy ()
413
418
else :
@@ -796,7 +801,8 @@ def transcribe_audio(self, input_sample):
796
801
task = self .task ,
797
802
vad_filter = self .use_vad ,
798
803
vad_parameters = self .vad_parameters if self .use_vad else None )
799
- if self .language is None :
804
+
805
+ if self .language is None and info is not None :
800
806
self .set_language (info )
801
807
return result
802
808
@@ -881,7 +887,9 @@ def speech_to_text(self):
881
887
input_sample = input_bytes .copy ()
882
888
result = self .transcribe_audio (input_sample )
883
889
884
- if self .language is None :
890
+ if result is None or self .language is None :
891
+ self .timestamp_offset += duration
892
+ time .sleep (0.25 ) # wait for voice activity, result is None when no voice activity
885
893
continue
886
894
self .handle_transcription_output (result , duration )
887
895
@@ -932,7 +940,6 @@ def update_segments(self, segments, duration):
932
940
"""
933
941
offset = None
934
942
self .current_out = ''
935
- last_segment = None
936
943
# process complete segments
937
944
if len (segments ) > 1 :
938
945
for i , s in enumerate (segments [:- 1 ]):
0 commit comments