# main.py — VAD-gated streaming speech assistant (116 lines / 3.97 KB per original listing)
import numpy as np
import pyaudio
from funasr import AutoModel
from loguru import logger
from silero_vad import get_speech_timestamps, load_silero_vad
from extension.qwen import qwen_stream_decision
from extension.tts import speak
# Audio-capture and streaming-ASR configuration.
SAMPLE_RATE = 16000  # Hz; mono microphone capture rate shared by VAD and ASR
CHUNK_SIZE_CONFIG = [0, 10, 5]  # FunASR streaming chunk_size triple — TODO confirm field meanings against FunASR docs
ENCODER_LOOKBACK = 4  # encoder_chunk_look_back passed to AutoModel.generate
DECODER_LOOKBACK = 1  # decoder_chunk_look_back passed to AutoModel.generate
CHUNK_STRIDE = CHUNK_SIZE_CONFIG[1] * 960 # 9600 samples (600 ms at 16 kHz) per stream read
VAD_THRESHOLD = 0.3  # Silero VAD speech-probability threshold
class Assistant:
    """Voice assistant loop: VAD-gated streaming ASR -> LLM decision -> TTS.

    Reads fixed-size chunks from the default microphone, uses Silero VAD to
    detect speech, feeds speech chunks through a streaming Paraformer ASR
    model, and on end-of-speech hands the accumulated transcript to
    ``qwen_stream_decision`` and speaks the reply via ``speak``.
    """

    def __init__(self):
        logger.info("正在初始化 Silero VAD (ONNX)...")
        self.vad_model = load_silero_vad(onnx=True)
        logger.info("正在初始化 FunASR (Streaming)...")
        self.asr_model = AutoModel(model="paraformer-zh-streaming")
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_STRIDE,
        )
        # Streaming-ASR state carried across the chunks of one utterance.
        self.asr_cache = {}
        self.is_speaking = False
        logger.info("已就绪。")

    @staticmethod
    def _extract_text(res):
        """Pull the recognized text out of a FunASR result list.

        FunASR results observed here are a list of dicts keyed by either
        "text" or "value", whose payload may be a string or a list of
        strings. Returns "" when nothing was recognized (empty result,
        missing key, or empty list payload).
        """
        if not res:
            return ""
        text = res[0].get("text", res[0].get("value", ""))
        if isinstance(text, list):
            # Guard against an empty list, which would crash text[0].
            text = text[0] if text else ""
        return text or ""

    def _recognize(self, audio_float32, is_final):
        """Run one streaming-ASR step; return the decoded text ("" if none)."""
        res = self.asr_model.generate(
            input=audio_float32,
            cache=self.asr_cache,
            is_final=is_final,
            chunk_size=CHUNK_SIZE_CONFIG,
            encoder_chunk_look_back=ENCODER_LOOKBACK,
            decoder_chunk_look_back=DECODER_LOOKBACK,
        )
        return self._extract_text(res)

    def run(self):
        """Main capture loop; blocks until interrupted (Ctrl-C).

        Audio resources are released in ``finally`` so cleanup happens on
        every exit path, not only on KeyboardInterrupt.
        """
        txt = ""
        try:
            while True:
                # 1. Capture one fixed-size audio chunk from the microphone.
                raw_data = self.stream.read(CHUNK_STRIDE, exception_on_overflow=False)
                audio_int16 = np.frombuffer(raw_data, dtype=np.int16)
                audio_float32 = audio_int16.astype(np.float32) / 32768.0
                # 2. VAD: does this chunk contain speech?
                speech_ts = get_speech_timestamps(
                    audio_float32,
                    self.vad_model,
                    threshold=VAD_THRESHOLD,
                    sampling_rate=SAMPLE_RATE,
                )
                if speech_ts:
                    if not self.is_speaking:
                        logger.debug("[VAD]Speaking")
                        self.is_speaking = True
                    # 3. Streaming ASR on the speech chunk (float32 samples).
                    text = self._recognize(audio_float32, is_final=False)
                    if text:
                        logger.debug(f"{text}")
                        txt += text
                elif self.is_speaking:
                    # 4. Speech just ended: flush the final ASR result.
                    text = self._recognize(audio_float32, is_final=True)
                    if text:
                        logger.debug(f"{text}")
                        txt += text
                    logger.debug(f"最终文本: {txt}")
                    speak(qwen_stream_decision(txt))
                    txt = ""
                    logger.debug("[VAD]Stopped")
                    # Reset streaming state for the next utterance.
                    self.asr_cache = {}
                    self.is_speaking = False
        except KeyboardInterrupt:
            pass
        finally:
            self.stop()

    def stop(self):
        """Stop and close the input stream, then terminate PortAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
if __name__ == "__main__":
    # Script entry point: build the assistant and block inside its capture loop.
    Assistant().run()