Rules for How Volcano Engine (火山) Subtitles Are Put On Screen
You said:
Add comments to the parts of the script that cover the rules for how real-time and history subtitles are put on screen and appended: [the full script was attached here; it is reproduced, with the requested comments, in the reply below]
 
Reasoned about subtitle overlay behavior for 7 seconds

Mapping the process

I’m piecing together how real-time and historical subtitles are displayed. The “SubtitleOverlayWindow” and “HistoryWindow” are crucial, with “ControlPanelWindow” finalizing segments for history.

Finalizing the details

I’m thinking through adding inline Chinese comments in the script to describe real-time and historical subtitles. This involves highlighting functions like self.subtitle_overlay.setSubtitles and self.history_window.append_history.

Below is the original script with detailed Chinese comments added around the rules for putting real-time and history subtitles on screen and appending to them (the key annotations are in SubtitleOverlayWindow, HistoryWindow, and the real-time/history update logic in ControlPanelWindow). Read the commented sections to follow how the two subtitle streams work:

python
import sys
import os
import time
import math
import json
import base64
import queue
import asyncio
import threading

import pyaudio
import websockets

from PyQt6.QtWidgets import (QApplication, QMainWindow, QPushButton, QTextEdit,
QVBoxLayout, QHBoxLayout, QWidget, QLabel, QDialog,
QFormLayout, QFontComboBox, QSpinBox, QColorDialog,
QFileDialog, QMenu, QSlider, QInputDialog, QSizeGrip,
QComboBox)
from PyQt6.QtCore import QThread, pyqtSignal, QTimer, Qt
from PyQt6.QtGui import QTextCursor, QFont, QColor

# ----------------- 配置参数 -----------------
ARK_API_KEY = os.environ.get("ARK_API_KEY", "032121df-fd92-4088-8d7f-8cf375b0a96d")
ENDPOINT_ID = "ep-20250225111541-dcznw" # 替换为你的实际 Endpoint ID
URL = f"wss://ark-beta.cn-beijing.volces.com/api/v3/realtime?service=clasi&model={ENDPOINT_ID}"
HEADERS = [("Authorization", f"Bearer {ARK_API_KEY}")]

# ----------------- 音频参数 -----------------
RATE = 16000 # 采样率(Hz)
CHANNELS = 1 # 单声道
SAMPLE_WIDTH = 2 # 16位(2 字节)
CHUNK_SIZE = int(RATE * 0.1) # 100ms 音频数据

def hex_to_rgb(hex_color):
"""将 #RRGGBB 格式转为 (R, G, B) 元组"""
hex_color = hex_color.lstrip('#')
return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

###############################################################################
# 对文字设置对话框:用于设置字幕的字体、字号和颜色
###############################################################################
class TextSettingsDialog(QDialog):
def __init__(self, parent=None, mode="rt", current_settings=None):
super().__init__(parent)
self.setWindowTitle("字幕设置")
self.mode = mode
if current_settings is None:
if mode == "rt":
current_settings = {"font_family": "Arial", "font_size": 16, "color": "#0000FF"}
else:
current_settings = {"font_family": "Arial", "font_size": 18, "color": "#000000"}
self.current_settings = current_settings

self.font_combo = QFontComboBox(self)
self.font_combo.setCurrentFont(QFont(current_settings["font_family"]))

self.size_spin = QSpinBox(self)
self.size_spin.setRange(8, 72)
self.size_spin.setValue(current_settings["font_size"])

self.color_button = QPushButton("选择颜色", self)
self.color = QColor(current_settings["color"])
self.color_button.setStyleSheet(f"background-color: {self.color.name()};")
self.color_button.clicked.connect(self.choose_color)

form_layout = QFormLayout()
form_layout.addRow("字体:", self.font_combo)
form_layout.addRow("字号:", self.size_spin)
form_layout.addRow("颜色:", self.color_button)

self.ok_button = QPushButton("确定", self)
self.cancel_button = QPushButton("取消", self)
self.ok_button.clicked.connect(self.accept)
self.cancel_button.clicked.connect(self.reject)
button_layout = QHBoxLayout()
button_layout.addStretch()
button_layout.addWidget(self.ok_button)
button_layout.addWidget(self.cancel_button)

main_layout = QVBoxLayout()
main_layout.addLayout(form_layout)
main_layout.addLayout(button_layout)
self.setLayout(main_layout)

def choose_color(self):
color = QColorDialog.getColor(initial=self.color, parent=self, title="选择颜色")
if color.isValid():
self.color = color
self.color_button.setStyleSheet(f"background-color: {self.color.name()};")

def get_settings(self):
return {
"font_family": self.font_combo.currentFont().family(),
"font_size": self.size_spin.value(),
"color": self.color.name()
}

###############################################################################
# 背景设置对话框:用于选择字幕背景颜色及透明度(以百分比表示)
###############################################################################
class BackgroundSettingsDialog(QDialog):
def __init__(self, parent=None, current_color="#000000", current_transparency=80):
"""
current_transparency: 0~100,0%表示完全不透明,100%表示完全透明
"""

super().__init__(parent)
self.setWindowTitle("背景设置")
self.current_color = QColor(current_color)
self.current_transparency = current_transparency

self.color_button = QPushButton("选择背景颜色", self)
self.color_button.setStyleSheet(f"background-color: {self.current_color.name()};")
self.color_button.clicked.connect(self.choose_color)

self.transparency_slider = QSlider(Qt.Orientation.Horizontal, self)
self.transparency_slider.setRange(0, 100)
self.transparency_slider.setValue(self.current_transparency)
self.transparency_label = QLabel(f"透明度: {self.current_transparency}%", self)
self.transparency_slider.valueChanged.connect(
lambda value: self.transparency_label.setText(f"透明度: {value}%")
)

form_layout = QFormLayout()
form_layout.addRow("背景颜色:", self.color_button)
form_layout.addRow("背景透明度 (%):", self.transparency_slider)
form_layout.addRow("", self.transparency_label)

self.ok_button = QPushButton("确定", self)
self.cancel_button = QPushButton("取消", self)
self.ok_button.clicked.connect(self.accept)
self.cancel_button.clicked.connect(self.reject)
button_layout = QHBoxLayout()
button_layout.addWidget(self.ok_button)
button_layout.addWidget(self.cancel_button)

main_layout = QVBoxLayout()
main_layout.addLayout(form_layout)
main_layout.addLayout(button_layout)
self.setLayout(main_layout)

def choose_color(self):
color = QColorDialog.getColor(initial=self.current_color, parent=self, title="选择背景颜色")
if color.isValid():
self.current_color = color
self.color_button.setStyleSheet(f"background-color: {self.current_color.name()};")

def get_settings(self):
return {
"bg_color": self.current_color.name(),
"bg_transparency": self.transparency_slider.value() # 0~100
}

###############################################################################
# 线程:音频录制(使用 RMS 计算音量,增加静音检测)
###############################################################################
class AudioRecorder(threading.Thread):
def __init__(self, audio_queue, update_volume_cb=None):
super().__init__()
self.audio_queue = audio_queue
self.update_volume_cb = update_volume_cb
self.pyaudio_instance = pyaudio.PyAudio()
self.stream = None
self.running = False
self.last_active_time = time.time()
self.volume_threshold = 300

def run(self):
self.stream = self.pyaudio_instance.open(
format=pyaudio.paInt16,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK_SIZE
)
self.running = True
while self.running:
try:
data = self.stream.read(CHUNK_SIZE, exception_on_overflow=False)
try:
self.audio_queue.put_nowait(data)
except queue.Full:
try:
self.audio_queue.get_nowait()
self.audio_queue.put_nowait(data)
except Exception:
pass
samples = [int.from_bytes(data[i:i+2], byteorder='little', signed=True)
for i in range(0, len(data), 2)]
if samples:
rms = math.sqrt(sum(s * s for s in samples) / len(samples))
if rms > self.volume_threshold:
self.last_active_time = time.time()
if self.update_volume_cb:
self.update_volume_cb(min(rms / 32768.0, 1.0))
except Exception as e:
print("AudioRecorder error:", e)
self.stream.stop_stream()
self.stream.close()
self.pyaudio_instance.terminate()

def stop(self):
self.running = False

###############################################################################
# 线程:WebSocket 发送与接收(处理音频数据发送和转录返回)
###############################################################################
class WebSocketThread(QThread):
subtitle_signal_orig = pyqtSignal(str)
subtitle_signal_trans = pyqtSignal(str)

def __init__(self, audio_queue, translation_direction="en2zh"):
super().__init__()
self.audio_queue = audio_queue
self.translation_direction = translation_direction
self.running = True

def run(self):
asyncio.run(self.websocket_main())

async def websocket_main(self):
try:
async with websockets.connect(URL, additional_headers=dict(HEADERS)) as ws:
print("WebSocket connection established.")

if self.translation_direction == "en2zh":
source_language = "en"
target_language = "zh"
else:
source_language = "zh"
target_language = "en"

session_update_msg = {
"type": "session.update",
"session": {
"input_audio_format": "pcm16",
"modalities": ["text"],
"input_audio_translation": {
"source_language": source_language,
"target_language": target_language,
"add_vocab": {"hot_word_list": [], "glossary_list": []}
}
}
}
print("Sending session.update message:")
print(json.dumps(session_update_msg, indent=2))
await ws.send(json.dumps(session_update_msg))

print("Waiting for session.* event from server...")
session_ready = False
try:
response = await asyncio.wait_for(ws.recv(), timeout=3.0)
print("Received raw message:", response)
event = json.loads(response)
event_type = event.get("type")
print("Parsed event type:", event_type)
if event_type in ("session.created", "session.updated"):
print("Session is ready, now proceed with audio commits.")
session_ready = True
else:
session_ready = True
except asyncio.TimeoutError:
print("Did not receive session.* event within 3 seconds. Proceeding anyway.")
session_ready = True

if session_ready:
send_task = asyncio.create_task(self.send_audio(ws))
recv_task = asyncio.create_task(self.receive_messages(ws))
ping_task = asyncio.create_task(self.ping_loop(ws))
await asyncio.gather(send_task, recv_task, ping_task)

except Exception as e:
print("WebSocket connection error:", e)

async def send_audio(self, ws):
loop = asyncio.get_running_loop()
while self.running:
try:
                data = await loop.run_in_executor(None, lambda: self.audio_queue.get(timeout=0.10))  # queue.Queue.get 的第一个位置参数是 block 而非 timeout;必须显式传 timeout,否则永久阻塞且 queue.Empty 永远不会抛出
if not data:
print("Warning: Received empty audio data.")
continue
encoded_data = base64.b64encode(data).decode('utf-8')
commit_msg = {"type": "input_audio.commit", "audio": encoded_data}
await ws.send(json.dumps(commit_msg))
except queue.Empty:
print("Queue is empty, retrying...")
await asyncio.sleep(0.10)
except websockets.exceptions.ConnectionClosedError as e:
print("WebSocket connection closed during send:", e)
self.running = False
break
except Exception as e:
print("Unexpected error during send:", e)
self.running = False
break
try:
await ws.send(json.dumps({"type": "input_audio.done"}))
except Exception:
pass

async def receive_messages(self, ws):
while self.running:
try:
message = await ws.recv()
print("Received raw message:", message)
event = json.loads(message)
print("Parsed event:", event)
if event.get("type") == "response.input_audio_transcription.delta":
transcription = event.get("delta", "")
# 收到服务端发回的“实时原文字幕”增量时,通过信号发给主线程
self.subtitle_signal_orig.emit(transcription)
elif event.get("type") == "response.input_audio_translation.delta":
translation = event.get("delta", "")
# 收到服务端发回的“实时译文字幕”增量时,通过信号发给主线程
self.subtitle_signal_trans.emit(translation)
except websockets.exceptions.ConnectionClosedError as e:
print("WebSocket connection closed during receive:", e)
break
except Exception as e:
print("WebSocket receive error:", e)
break

async def ping_loop(self, ws):
try:
while self.running:
await asyncio.sleep(30)
if self.running:
pong_waiter = await ws.ping()
print("Sent ping to server.")
try:
await asyncio.wait_for(pong_waiter, timeout=5)
print("Received pong from server.")
except asyncio.TimeoutError:
print("Did not receive pong in time, connection may be lost.")
except websockets.exceptions.ConnectionClosed:
print("ping_loop: WebSocket connection closed.")
except Exception as e:
print("ping_loop: Unexpected error:", e)

def stop(self):
self.running = False

###############################################################################
# 历史字幕窗口:显示已定型的字幕记录(滚动、字号较大)
###############################################################################
class HistoryWindow(QMainWindow):
"""
历史字幕窗口:
1. 用于显示所有“已定型”的字幕文本(即当一个段落确定结束后追加到这里)。
2. 提供单独的样式设置,如字体、大小、颜色,并与实时字幕独立。
3. 支持右键菜单进行历史字幕的样式设置,或者导出为TXT文件。
"""

def __init__(self):
super().__init__()
self.setWindowTitle("历史字幕")
self.setGeometry(850, 100, 600, 600)
# 默认历史字幕样式
self.current_history_font = "Arial"
self.current_history_font_size = 18
self.current_history_color = "#000000"

self.history_text = QTextEdit(self)
self.history_text.setReadOnly(True)
self.update_style({
"font_family": self.current_history_font,
"font_size": self.current_history_font_size,
"color": self.current_history_color
})

self.history_text.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
self.history_text.customContextMenuRequested.connect(self.show_history_context_menu)

layout = QVBoxLayout()
layout.addWidget(self.history_text)
container = QWidget()
container.setLayout(layout)
self.setCentralWidget(container)

def append_history(self, segment):
"""
将一段完成的字幕(原文 + 译文)追加到历史记录中。
每个新的段落会在历史文本中另起一行。
"""

current = self.history_text.toPlainText()
new_text = current + "\n\n" + segment if current else segment
self.history_text.setPlainText(new_text)
cursor = self.history_text.textCursor()
cursor.movePosition(QTextCursor.MoveOperation.End)
self.history_text.setTextCursor(cursor)

def update_style(self, settings):
"""
更新历史字幕的整体样式(字体、字号、颜色)。
"""

style = f"font-family: {settings['font_family']}; font-size: {settings['font_size']}px; color: {settings['color']};"
self.history_text.setStyleSheet(style)
self.current_history_font = settings["font_family"]
self.current_history_font_size = settings["font_size"]
self.current_history_color = settings["color"]

def show_history_context_menu(self, pos):
menu = QMenu(self.history_text)
settings_action = menu.addAction("设置历史字幕")
save_action = menu.addAction("保存为 TXT")
settings_action.triggered.connect(self.open_history_settings)
save_action.triggered.connect(self.save_history_as_txt)
menu.exec(self.history_text.mapToGlobal(pos))

def open_history_settings(self):
dialog = TextSettingsDialog(self, mode="history", current_settings={
"font_family": self.current_history_font,
"font_size": self.current_history_font_size,
"color": self.current_history_color
})
if dialog.exec() == QDialog.DialogCode.Accepted:
settings = dialog.get_settings()
self.update_style(settings)

def save_history_as_txt(self):
filename, _ = QFileDialog.getSaveFileName(self, "保存历史字幕", "", "Text Files (*.txt);;All Files (*)")
if filename:
with open(filename, "w", encoding="utf-8") as f:
f.write(self.history_text.toPlainText())

###############################################################################
# 字幕叠加窗口:用于在视频上叠加显示实时字幕
# 1. 每次收到 WebSocketThread 的增量字幕,会实时更新(原文 + 译文)。
# 2. 实时字幕背景的透明度以百分比设置(0%为不透明,100%为完全透明)。
# 3. 支持窗口拖动和交互式拖动缩放(使用 QSizeGrip 可改变宽度和高度)。
# 4. 右键菜单中增加“设置实时字幕”和“设置背景”选项。
###############################################################################
class SubtitleOverlayWindow(QMainWindow):
"""
实时字幕窗口:
1. 用于显示从服务端传回的原文和译文的“实时”增量字幕。
2. 当段落结束后,ControlPanelWindow 会将该段落“定型”并写入 HistoryWindow,
同时实时字幕会重新开始接收下一段的内容(即清空并从头显示)。
"""

def __init__(self):
super().__init__()
# 设置窗口为无边框、置顶、工具窗口
self.setWindowFlags(Qt.WindowType.FramelessWindowHint |
Qt.WindowType.WindowStaysOnTopHint |
Qt.WindowType.Tool)
# 允许窗口背景透明
self.setAttribute(Qt.WidgetAttribute.WA_TranslucentBackground)
# 初始背景为黑色且完全不透明(默认不透明,可通过背景设置调整透明度)
self.setGeometry(100, 100, 800, 200)

# 初始最大宽度取窗口宽度
self.max_width = self.width()

# 两个标签用于显示原文与译文(居中,多行显示)
self.orig_label = QLabel("", self)
self.orig_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
self.orig_label.setWordWrap(True)
self.orig_label.setMaximumWidth(self.max_width)
self.trans_label = QLabel("", self)
self.trans_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
self.trans_label.setWordWrap(True)
self.trans_label.setMaximumWidth(self.max_width)

layout = QVBoxLayout()
layout.addWidget(self.orig_label)
layout.addWidget(self.trans_label)
# 增加 QSizeGrip 用于交互式拖动缩放
self.size_grip = QSizeGrip(self)
layout.addWidget(self.size_grip, 0, Qt.AlignmentFlag.AlignRight)

container = QWidget(self)
container.setLayout(layout)
self.setCentralWidget(container)

# 默认实时字幕样式及背景设置
self.rt_font_family = "Arial"
self.rt_font_size = 24
self.rt_color = "#FFFFFF"
self.bg_color = "#000000" # 黑色背景
self.bg_transparency = 80 # 透明度百分比(默认80%,即有效透明度 = (100 - 80)/100 = 0.2)
self.update_style()

# 用于拖动窗口的偏移量
self.offset = None

def update_style(self):
rgb = hex_to_rgb(self.bg_color)
effective_alpha = (100 - self.bg_transparency) / 100.0
bg_style = f"background-color: rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {effective_alpha});"
text_style = f"font-family: {self.rt_font_family}; font-size: {self.rt_font_size}px; color: {self.rt_color};"
style = text_style + bg_style
self.orig_label.setStyleSheet(style)
self.trans_label.setStyleSheet(style)

def setSubtitles(self, orig, trans):
"""
实时更新字幕窗口中的原文和译文文本。
这里的orig和trans是当前尚未“定型”的段落增量。
"""

self.orig_label.setText(orig)
self.trans_label.setText(trans)

# 实现窗口拖动
def mousePressEvent(self, event):
if event.button() == Qt.MouseButton.LeftButton:
self.offset = event.pos()

def mouseMoveEvent(self, event):
if event.buttons() & Qt.MouseButton.LeftButton and self.offset is not None:
self.move(self.pos() + event.pos() - self.offset)

def mouseReleaseEvent(self, event):
self.offset = None

# 右键菜单:设置实时字幕样式和背景设置
def contextMenuEvent(self, event):
menu = QMenu(self)
set_font_action = menu.addAction("设置实时字幕")
set_bg_action = menu.addAction("设置背景")
action = menu.exec(event.globalPos())
if action == set_font_action:
self.open_rt_settings()
elif action == set_bg_action:
self.open_bg_settings()

def open_rt_settings(self):
dialog = TextSettingsDialog(self, mode="rt", current_settings={
"font_family": self.rt_font_family,
"font_size": self.rt_font_size,
"color": self.rt_color
})
if dialog.exec() == QDialog.DialogCode.Accepted:
settings = dialog.get_settings()
self.rt_font_family = settings["font_family"]
self.rt_font_size = settings["font_size"]
self.rt_color = settings["color"]
self.update_style()

def open_bg_settings(self):
dialog = BackgroundSettingsDialog(self, current_color=self.bg_color,
current_transparency=self.bg_transparency)
if dialog.exec() == QDialog.DialogCode.Accepted:
settings = dialog.get_settings()
self.bg_color = settings["bg_color"]
self.bg_transparency = settings["bg_transparency"]
self.update_style()

def resizeEvent(self, event):
self.max_width = self.width()
self.orig_label.setMaximumWidth(self.max_width)
self.trans_label.setMaximumWidth(self.max_width)
super().resizeEvent(event)

###############################################################################
# 控制面板窗口:用于控制录音、转写、翻译,并将字幕同步更新到叠加窗口与历史窗口
# 另外增加退出按钮,退出时关闭程序
###############################################################################
class ControlPanelWindow(QMainWindow):
"""
控制面板窗口:
1. 负责启动和停止录音(AudioRecorder)以及WebSocket通信(WebSocketThread)。
2. 收到实时原文/译文增量后,调用SubtitleOverlayWindow.setSubtitles()进行实时显示。
3. 通过一个定时器(段落间隔)来判断何时“定型”一个段落并写入 HistoryWindow 中。
- 在定时器超时(如2秒无新增量)时,将当前的原文+译文拼接后append到HistoryWindow中。
- 同时标记下一段落开始,清空实时字幕等。
"""

def __init__(self, subtitle_overlay, history_window):
super().__init__()
self.setWindowTitle("控制面板")
self.setGeometry(900, 100, 300, 280) # 稍微调整高度以容纳新控件
self.subtitle_overlay = subtitle_overlay
self.history_window = history_window

self.translation_direction = "en2zh" # 默认英译中
# current_orig_segment 和 current_trans_segment 用于累加“实时增量”字幕
self.current_orig_segment = ""
self.current_trans_segment = ""
# 当一次“定型”完成后,这两个变量会被清空,准备接收下一段内容
self.orig_finalized = False
self.trans_finalized = False

self.audio_queue = queue.Queue(maxsize=10)
self.audio_recorder = None
self.ws_thread = None

# 定时器用于确定段落结束,初始设置2秒(也可以通过段落间隔下拉框来改变)
self.segment_timer = QTimer(self)
self.segment_timer.timeout.connect(self.finalize_segment)
self.segment_timer.setInterval(2000)

# 控制按钮
self.start_button = QPushButton("开始", self)
self.pause_button = QPushButton("暂停", self)
self.pause_button.setEnabled(False)
self.toggle_button = QPushButton("EN → ZH", self)
self.exit_button = QPushButton("退出", self)

self.start_button.clicked.connect(self.start_recording)
self.pause_button.clicked.connect(self.pause_recording)
self.toggle_button.clicked.connect(self.toggle_translation_direction)
self.exit_button.clicked.connect(self.exit_program)

# 新增:段落间隔设置选项(1秒、2秒、3秒)
interval_layout = QHBoxLayout()
interval_label = QLabel("段落间隔:")
self.interval_combo = QComboBox(self)
self.interval_combo.addItems(["1秒", "2秒", "3秒"])
self.interval_combo.setCurrentIndex(1) # 默认选中2秒
self.interval_combo.currentIndexChanged.connect(self.change_timer_interval)
interval_layout.addWidget(interval_label)
interval_layout.addWidget(self.interval_combo)

layout = QVBoxLayout()
layout.addWidget(self.start_button)
layout.addWidget(self.pause_button)
layout.addWidget(self.toggle_button)
layout.addLayout(interval_layout)
layout.addWidget(self.exit_button)

container = QWidget()
container.setLayout(layout)
self.setCentralWidget(container)

def change_timer_interval(self):
# 根据下拉菜单选择的索引(0对应1秒,1对应2秒,2对应3秒)设置定时器间隔
seconds = self.interval_combo.currentIndex() + 1
self.segment_timer.setInterval(seconds * 1000)
print(f"段落间隔已设置为 {seconds} 秒")

def toggle_translation_direction(self):
# 切换翻译方向
self.translation_direction = "zh2en" if self.translation_direction == "en2zh" else "en2zh"
self.toggle_button.setText("EN → ZH" if self.translation_direction == "en2zh" else "ZH → EN")
# 重新启动录音和WebSocket线程以应用新的翻译方向
if self.audio_recorder or self.ws_thread:
self.pause_recording()
self.start_recording()

def exit_program(self):
QApplication.quit()

def start_recording(self):
# 启动录音线程和WebSocket线程
if not self.audio_recorder:
self.audio_recorder = AudioRecorder(self.audio_queue)
self.audio_recorder.start()
if not self.ws_thread:
self.ws_thread = WebSocketThread(self.audio_queue, translation_direction=self.translation_direction)
# 连接信号,以便实时接收增量字幕并更新UI
self.ws_thread.subtitle_signal_orig.connect(self.update_subtitle_orig)
self.ws_thread.subtitle_signal_trans.connect(self.update_subtitle_trans)
self.ws_thread.start()
self.start_button.setEnabled(False)
self.pause_button.setEnabled(True)

def pause_recording(self):
# 停止录音和WebSocket,并将当前段落做一次定型
self.finalize_segment()
if self.audio_recorder:
self.audio_recorder.stop()
self.audio_recorder = None
if self.ws_thread:
self.ws_thread.stop()
self.ws_thread = None
self.start_button.setEnabled(True)
self.pause_button.setEnabled(False)

def update_subtitle_orig(self, text):
"""
当收到实时原文增量时,进行拼接。
如果在前一次finalize_segment()中标记为已定型,则说明这次是新的段落,先清空再拼接。
然后通过SubtitleOverlayWindow.setSubtitles()来实时显示。
"""

if self.orig_finalized:
self.current_orig_segment = ""
self.orig_finalized = False
self.current_orig_segment += text
self.subtitle_overlay.setSubtitles(self.current_orig_segment, self.current_trans_segment)
# 每次收到增量后,重新启动段落定时器
self.segment_timer.start()

def update_subtitle_trans(self, text):
"""
当收到实时译文增量时,进行拼接。
如果在前一次finalize_segment()中标记为已定型,则说明这次是新的段落,先清空再拼接。
然后通过SubtitleOverlayWindow.setSubtitles()来实时显示。
"""

if self.trans_finalized:
self.current_trans_segment = ""
self.trans_finalized = False
self.current_trans_segment += text
self.subtitle_overlay.setSubtitles(self.current_orig_segment, self.current_trans_segment)
# 每次收到增量后,重新启动段落定时器
self.segment_timer.start()

def finalize_segment(self):
"""
段落定型:
1. 将当前的原文和译文拼接为一段完整字幕并追加到历史窗口中。
2. 标记 orig_finalized 和 trans_finalized = True,表示下一次收到新增量时要清空重来。
3. 停止当前的定时器,等待下次增量时重新开始计时。
"""

if self.current_orig_segment or self.current_trans_segment:
segment = f"{self.current_orig_segment}\n{self.current_trans_segment}"
# 将当前段落写入历史字幕
self.history_window.append_history(segment)
# 标记为已定型,下次更新时会清空
self.orig_finalized = True
self.trans_finalized = True
self.segment_timer.stop()

def main():
app = QApplication(sys.argv)
history_window = HistoryWindow()
history_window.show()
subtitle_overlay = SubtitleOverlayWindow()
subtitle_overlay.show()
control_panel = ControlPanelWindow(subtitle_overlay, history_window)
control_panel.show()
sys.exit(app.exec())

if __name__ == "__main__":
main()


Key points in the added comments

  1. Real-time subtitles

    • In WebSocketThread, each incremental subtitle returned by the server is forwarded through the subtitle_signal_orig and subtitle_signal_trans signals to the main thread's update_subtitle_orig and update_subtitle_trans methods.
    • In ControlPanelWindow, update_subtitle_orig / update_subtitle_trans append each newly received delta to the current segment variables (current_orig_segment and current_trans_segment) and call SubtitleOverlayWindow.setSubtitles to display the result live.
    • Every incoming delta restarts the timer that decides when to "finalize" the segment (i.e. if no new text arrives for a set number of seconds, the segment is considered finished).
  2. History subtitles

    • When the timer expires (or recording is paused manually), ControlPanelWindow.finalize_segment merges the current segment's source text and translation and calls HistoryWindow.append_history to append them to the history window.
    • Once a segment is finalized, current_orig_segment / current_trans_segment are cleared before the next delta arrives, marking the start of a new segment.

These comments should make the display-and-append flow clear:

  • real-time subtitles are displayed incrementally and updated continuously;
  • history subtitles are appended only after a segment completes (timeout or manual pause).
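
To make these rules concrete, here is a minimal, Qt-free sketch of the same accumulate → finalize → reset cycle. SegmentAccumulator is a hypothetical name invented for this illustration; it is not part of the script, but its on_delta/finalize logic mirrors update_subtitle_orig and finalize_segment:

python
class SegmentAccumulator:
    """Minimal re-implementation of ControlPanelWindow's accumulation rules,
    without Qt. Deltas are appended until finalize() runs; the first delta
    after a finalize starts a fresh segment."""

    def __init__(self):
        self.current = ""        # mirrors current_orig_segment
        self.finalized = False   # mirrors orig_finalized
        self.history = []        # mirrors the HistoryWindow contents

    def on_delta(self, text):
        if self.finalized:       # previous segment was committed:
            self.current = ""    # start a new one from scratch
            self.finalized = False
        self.current += text     # the live overlay would show self.current

    def finalize(self):
        if self.current:
            self.history.append(self.current)  # append to history window
            self.finalized = True              # mark: next delta resets

acc = SegmentAccumulator()
for d in ["Hello", " everyone", ", welcome."]:
    acc.on_delta(d)
acc.finalize()                   # simulates the 2 s timeout firing
acc.on_delta("We're talking")    # new segment starts clean
print(acc.history)               # ['Hello everyone, welcome.']
print(acc.current)               # We're talking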

You said:
Please give me concrete examples of how these decisions are made, something more intuitive
 
Reasoned for a few seconds

Below is a more concrete, step-by-step example of how "real-time subtitles" and "history subtitles" cooperate. It simulates a speaker saying two sentences, with several incremental ("delta") results returned along the way.


Example scenario

  • Translation direction: EN → ZH (the user speaks English; the English text is displayed live and simultaneously translated into Chinese)
  • Segment timer: 2 seconds (if no new delta arrives within 2 seconds, the segment is considered finished and is finalized into the history subtitles)

Suppose the user says two sentences:

  1. "Hello everyone, welcome to the meeting."
  2. "We're talking about real-time transcription."

During recording the server keeps returning real-time deltas (simplified here to one plausible sequence). The whole process can be followed in these steps:


Sentence 1, step by step

  1. The user starts speaking

    • The user begins with "He…"; the recording thread (AudioRecorder) starts capturing audio chunks and keeps pushing them into audio_queue.
    • WebSocketThread pulls audio from audio_queue and sends it to the server for real-time transcription.
  2. The server returns deltas

    • Early on, with only part of a syllable heard, the deltas may be very short, for example:
      • source delta: "H"
      • Chinese translation delta: "哈"
    • Since this is the first delta, ControlPanelWindow's update_subtitle_orig("H") clears the previously finalized segment (if one was marked) and sets current_orig_segment to "H"; likewise current_trans_segment becomes "哈".
    • The overlay window (SubtitleOverlayWindow) now shows:
      [live source]: H
      [live translation]: 哈
    • At the same time the 2-second segment timer (segment_timer) is restarted, preventing it from finalizing the segment.
  3. Deltas keep arriving

    • As the user goes on with "Hello everyone, welcome…", the server keeps correcting and extending the text, e.g. returning in sequence:
      • "He" → "Hell" → "Hello ev" → "Hello everyone, welc" → …
      • The translation deltas likewise grow from "哈" → "哈喽" → "哈喽 大家好, 欢迎..." (illustrative only; the actual translation may differ).
    • ControlPanelWindow keeps concatenating inside update_subtitle_orig: first "H" + "e" = "He", then + "ll" gives "Hell", and so on.
    • Key point: because new deltas keep coming in, the segment timer is restarted again and again, so the segment is never judged finished mid-speech.
  4. The user finishes sentence 1 and pauses briefly

    • When the user pauses at the end of the sentence, the server receives no new audio, or only silence.
    • WebSocketThread therefore receives no further subtitle deltas.
    • After 2 seconds, the timer (segment_timer) finally times out and triggers finalize_segment().
    • finalize_segment() merges the accumulated current_orig_segment (e.g. "Hello everyone, welcome to the meeting.") and current_trans_segment (its Chinese translation) into one block of text:
      Hello everyone, welcome to the meeting.
      大家好,欢迎参加会议。
    • It calls HistoryWindow.append_history(...), appending one segment to the history window.
    • It also marks orig_finalized = True and trans_finalized = True, so that when the next delta arrives, accumulation restarts from empty.
  5. State of the history window

    • HistoryWindow now holds one complete, finalized segment:
      Hello everyone, welcome to the meeting.
      大家好,欢迎参加会议。
    • If the user keeps talking, each later segment starts a new paragraph in the history window.

Sentence 2, step by step

  1. The user continues speaking

    • A little later the user starts the second sentence: "We're talking about real-time transcription."
    • New deltas begin arriving: "W" → "We'" → "We're t" → "We're talking a" → …
    • Source and translation are again accumulated by update_subtitle_orig / update_subtitle_trans; ControlPanelWindow sees orig_finalized = True, so it first clears current_orig_segment and then accumulates from scratch.
    • The overlay window shows live:
      [live source]: We're talking about real-time trans...
      [live translation]: 我们正在讨论实时转写...
    • The segment timer is again reset by every delta.
  2. Sentence 2 ends, another pause

    • As before, 2 seconds without a new delta triggers finalize_segment().
    • current_orig_segment might be "We're talking about real-time transcription."
      and current_trans_segment "我们正在讨论实时转写技术。"
    • They are merged and appended to the history subtitles:
      We're talking about real-time transcription.
      我们正在讨论实时转写技术。
    • The live overlay is cleared again, waiting for the next utterance.

Key decision logic

  1. Why the real-time subtitles keep showing deltas

    • Whenever the server returns a new chunk of text (even a one- or two-character change), update_subtitle_orig / update_subtitle_trans concatenate it and call SubtitleOverlayWindow.setSubtitles(...).
    • Every new delta calls start() on the segment timer, resetting it so it cannot time out and finalize.
  2. Why the history subtitles are appended only after finalization

    • finalize_segment() runs only when "the user pauses for a while" or "recording is paused manually". In other words, a segment is judged finished only when no new deltas are arriving.
    • At that moment the whole segment's source text and translation are appended to HistoryWindow in one go, separated from earlier segments by a blank line.
  3. Clearing state

    • After finalize_segment() runs, orig_finalized and trans_finalized are set to True, meaning the current segment is done; when the next delta arrives, accumulation must start over (so on the next delta we first check whether orig_finalized / trans_finalized is True, and if so clear current_orig_segment / current_trans_segment).

Summary

  • Real-time subtitles
    "display while speaking": every small delta updates the overlay window; this is short-lived, incremental output.

  • History subtitles
    once a stretch of speech finishes and 2 seconds pass with no update, the segment is "finalized" into the history; it is appended only after completion.

This example shows directly how the script uses "real-time deltas plus a timer that detects pauses" to implement live display and history finalization, and should make the decision logic in the script easier to follow.
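
To replay this decision process offline, here is a small self-contained sketch (the arrival times and deltas are invented for illustration) that applies the rule "finalize when 2 seconds pass with no new delta" to a list of timed events:

python
events = [           # (arrival time in seconds, transcription delta)
    (0.0, "He"), (0.3, "llo everyone,"), (0.9, " welcome to the meeting."),
    # ... 2 s of silence: segment 1 finalizes before the next delta ...
    (4.0, "We're"), (4.4, " talking about real-time transcription."),
]

TIMEOUT = 2.0
segments, current, last_arrival = [], "", None
for t, delta in events:
    if last_arrival is not None and t - last_arrival >= TIMEOUT:
        segments.append(current)   # the timer expired before this delta
        current = ""
    current += delta
    last_arrival = t               # every delta "restarts" the timer
if current:
    segments.append(current)       # e.g. recording was paused at the end

for s in segments:
    print(repr(s))
# 'Hello everyone, welcome to the meeting.'
# "We're talking about real-time transcription."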

You said:
Hello everyone, welcome to the meeting. In this example, is the timer started after "meeting" ends?
 
Reasoned about segment timer activation for 11 seconds

Understanding timer mechanics

I’m thinking through how the timer resets with each new text chunk. After the last chunk, a 2-second pause finalizes it.

Examining timer progression

I’m piecing together that the timer starts with the first partial increment and resets with each new one, until 2 seconds pass without new increments, marking the end of the phrase.

Clarifying timer mechanics

The timer is activated from the start, resetting with each new text chunk. It finalizes 2 seconds after the last chunk is received, not when “meeting” is completed.

The timer is not started only after the word "meeting" ends; it runs from the very first subtitle delta. More precisely, every new delta restarts the timer, beginning a fresh 2-second countdown.

In other words:

  1. When the user starts saying "Hello …" and the first deltas arrive (e.g. "He", "Hello e"), the timer is started/reset.
  2. As long as new deltas keep coming (say "Hello eve" → "Hello every" → …), the timer keeps being reset and never reaches its timeout.
  3. After the user finishes "meeting." and no further delta arrives (the user has paused), the timer is no longer reset; it keeps counting until the 2-second timeout.
  4. Once the timer times out (2 seconds with no new delta), the "finalize" step (finalize_segment()) fires and the whole sentence "Hello everyone, welcome to the meeting." is appended to the history subtitles.

So the timer does not wait for "meeting" to finish before starting; it is active the whole time. It is merely reset on every delta, and only when deltas stop arriving does it count down to zero and finalize the segment.
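
The "restart on every delta" behavior relies on the fact that calling start() on an already-running QTimer restarts it. A tiny standalone demo of just that (hypothetical; not part of the script):

python
import sys
from PyQt6.QtCore import QCoreApplication, QTimer

app = QCoreApplication(sys.argv)

segment_timer = QTimer()
segment_timer.setInterval(2000)  # 2 s, as in the script
segment_timer.timeout.connect(lambda: (print("finalize!"), app.quit()))

segment_timer.start()                         # "first delta" at t = 0 s
QTimer.singleShot(1000, segment_timer.start)  # "new delta" at t = 1 s: restart
# No further deltas: "finalize!" prints at t = 3 s (1 s + a full 2 s), not at 2 s.

app.exec()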

You said:
So, while "Hello everyone, welcome to the meeting." is being spoken, if "welcome to the meeting" arrives as a new delta, the 2-second countdown restarts from the moment "welcome to the meeting" arrives, and the segment is only finalized if no further delta comes within those 2 seconds. Is that right?
 
Reasoned for 6 seconds

Aligning logical steps

I’m confirming that each new increment in the script logic triggers segment_timer.start(), resetting the timer to zero for the next 2-second period.

Maintaining timer consistency

I’m indicating that the timer resets to 2 seconds for each new chunk, and if no further increments arrive within 2 seconds, it finalizes.

Yes, your understanding is correct. The script's logic boils down to:

  1. Every time a new subtitle delta arrives (i.e. the server returns a small piece of new text), segment_timer.start() is called, restarting the 2-second countdown.
  2. If another delta arrives within those 2 seconds, the countdown is reset to zero again and another 2-second wait begins.
  3. Only when more than 2 seconds pass without any new delta does finalize_segment() run, "finalizing" the current subtitles into the history.

So, for example:

  • A delta "… Hello everyone, " arrives: the timer immediately starts a 2-second countdown.
  • One second later (say) the delta "welcome to the meeting." arrives: the timer is reset and a fresh 2-second wait begins.
  • If nothing else arrives to interrupt the countdown, finalization triggers once those 2 seconds elapse.

Likewise, if another delta shows up just before the 2-second timeout (for example "." or a translation correction), the countdown is reset yet again. Only when no new deltas arrive at all does the 2-second timeout actually fire and the segment get "finalized" into the history window.

You said:
The history subtitles currently have a problem: they tend to break a segment where there is no punctuation, e.g.: Uh, I'm, I'm, I'm very, uh, grateful to all of you who who might be out there listening to me tonight. The only only problem with doing this online compared with doing it in Noma State Hall as I used to, uh, or in other settings is that I don't get to 我非常感谢所有今晚可能在外面听我讲话的人。在网上做这件事的唯一问题是,与我过去在诺玛市政厅做这件事相比,或在其他环境中,我无法 be right with you in the room, experience you more directly and see the looks on your faces, um, get inspired by how how you're responding to things. 与你在房间里,更直接地体验你,看到你们的表情,从你们对事情的反应中获得灵感。 I want to change the code so that the history subtitles show complete sentences. What rules do I need to add?
 
Reasoned about adding sentence completion rules for 14 seconds

Ensuring subtitle accuracy

OK, let me see. The user is looking to keep subtitles intact with clear punctuation. This reflects progress in refining subtitle rendering for a more professional presentation.

Ensuring complete sentences

I’m working through a rule to finalize subtitles only after detecting punctuation, focusing on “update_subtitle_orig” and “update_subtitle_trans” functions to maintain sentence structure.

Piecing together subtitling logic

I’m working through adding rules to finalize subtitles after punctuation in “update_subtitle_orig,” considering both punctuation and time out triggers for a seamless, professional effect.

Refining subtitling logic

I’m digging into the finalize_segment method to enhance subtitle accuracy. By checking punctuation and waiting a bit longer, it ensures clarity and completeness before finalizing.

Enhancing subtitle precision

I’m planning to streamline the code with a smaller buffer for partial sentences, finalizing them when punctuation is detected or after a brief pause, ensuring concise and accurate subtitles.

Finalizing subtitles

OK, let me see. By tweaking “update_subtitle_orig” or “update_subtitle_trans” to check if text ends with punctuation, I’m piecing together a clear and concise guide for finalizing subtitles.

In the current logic, whether the history subtitles get "finalized" depends entirely on the rule "no new delta received within a period of time (e.g. 2 seconds)": as soon as deltas stop, finalize_segment() fires and the current segment is written to the history.
If speech keeps streaming in with no pause and no punctuation, finalization still happens only via the silence/timeout check, which produces exactly what you describe: a sentence gets split and written to the history before it is finished.


Approach: use sentence-final punctuation to help detect the end of a segment

You want a segment to be finalized only when it forms a complete sentence. That usually means splitting segments when a sentence-ending mark (punctuation) is detected, instead of relying solely on the timed cutoff (the 2-second pause). The simplest version:

  1. After each delta is appended, check whether the text now ends with a major punctuation mark (such as . ? ! or the Chinese 。?!).
  2. If a sentence-final mark is detected, call finalize_segment() immediately, writing the sentence to the history and clearing the current segment buffer.
  3. If there is neither punctuation nor a pause, keep the existing behavior: the 2-second timer (or a manual pause) acts as a fallback, so speech without any punctuation still gets finalized eventually.

Core change, by example

In your existing code, the natural place for this check is update_subtitle_orig or update_subtitle_trans in ControlPanelWindow (which one depends on whether you want to detect sentence ends from the source text or from the translation). The example below puts it in update_subtitle_orig:

python
def update_subtitle_orig(self, text):
    # 1) If the previous segment was finalized, this is a new segment: clear the buffer first
    if self.orig_finalized:
        self.current_orig_segment = ""
        self.orig_finalized = False

    # 2) Append the new delta to current_orig_segment
    self.current_orig_segment += text

    # 3) Update the real-time subtitles
    self.subtitle_overlay.setSubtitles(self.current_orig_segment, self.current_trans_segment)

    # 4) Restart the segment timer (existing logic)
    self.segment_timer.start()

    # -------------------------
    # 5) New: check for sentence-final punctuation
    # A set of likely sentence-ending marks; extend per language as needed
    sentence_endings = {'.', '?', '!', '。', '?', '!'}

    # Strip surrounding whitespace from the accumulated text
    trimmed = self.current_orig_segment.strip()
    if trimmed and trimmed[-1] in sentence_endings:
        # The last character is a sentence-final mark, so a full sentence has likely ended
        self.finalize_segment()

With this in place, as soon as the accumulated text ends in sentence-final punctuation, finalize_segment() fires immediately and the text is written to the history subtitles.

Notes:

  • "Finalize as soon as punctuation appears" can over-split (for example when one utterance contains several periods or an ellipsis); refine it to suit your needs.
  • If the speaker produces no punctuation at all (rambling speech with no pauses), the 2-second timer still acts as the fallback, so the segment finalizes after a pause.
  • To cover both Chinese and English punctuation, extend sentence_endings, or add more sophisticated text analysis (NLP sentence splitting, prosody, etc.).

Going further: finalize only on "punctuation + pause"

The example above finalizes the moment punctuation appears. You may prefer to finalize only when you see punctuation and then confirm a real pause followed, to avoid splitting too early in cases like "… but …" where the context continues. That combines punctuation detection with a timer, for example (a sketch follows this list):

  1. When a delta arrives and the text ends with punctuation, start a short "confirmation" timer (say 500 ms) instead of finalizing immediately.
  2. If another delta arrives within those 500 ms, the user evidently has more to say, so do not finalize yet; otherwise finalize once the 500 ms elapse.
  3. If no punctuation ever appears, fall back to the original 2-second timer.

This logic tracks how people actually pause while speaking, but it is more complex to implement: you need to manage two timers or write a small state machine.
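
A minimal sketch of that two-timer variant. Hedged assumptions: punct_timer is a hypothetical QTimer you would create in ControlPanelWindow.__init__ with setSingleShot(True), setInterval(500), and its timeout connected to finalize_segment; everything else mirrors the existing update_subtitle_orig:

python
SENTENCE_ENDINGS = {'.', '?', '!', '。', '?', '!'}

def update_subtitle_orig(self, text):
    if self.orig_finalized:
        self.current_orig_segment = ""
        self.orig_finalized = False
    self.current_orig_segment += text
    self.subtitle_overlay.setSubtitles(self.current_orig_segment,
                                       self.current_trans_segment)

    # Fallback: the usual 2-second silence timer is rearmed on every delta.
    self.segment_timer.start()

    trimmed = self.current_orig_segment.rstrip()
    if trimmed and trimmed[-1] in SENTENCE_ENDINGS:
        # Sentence-final mark seen: arm the short confirmation timer instead
        # of finalizing right away; finalize_segment runs only if no further
        # delta arrives within 500 ms (punct_timer is a single-shot QTimer).
        self.punct_timer.start()
    else:
        # A mid-sentence delta arrived: cancel any pending punctuation finalize.
        self.punct_timer.stop()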


Summary

  • Core idea: on top of the existing "finalize after 2 seconds with no deltas" rule, add a "detect sentence-final punctuation" rule so that history subtitles line up with complete sentences.
  • Simplest version: in the delta handlers (update_subtitle_orig or update_subtitle_trans), check whether the accumulated text ends with ., ?, !, 。 and so on, and call finalize_segment() as soon as it does.
  • Refined version: require "punctuation + a short pause" to avoid over-splitting.

Adapt these rules to your use case to get more natural history-subtitle segmentation.

You said:
The server returns the punctuation at the beginning of the next delta:

Received raw message: {"event_id":"event_021740671646412000000000000000000000000000000007fea43","type":"response.input_audio_transcription.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":" will be of of value in in your life","language":"en","start_ms":4002480,"end_ms":4005360}
Parsed event: {'event_id': 'event_021740671646412000000000000000000000000000000007fea43', 'type': 'response.input_audio_transcription.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': ' will be of of value in in your life', 'language': 'en', 'start_ms': 4002480, 'end_ms': 4005360}
Received raw message: {"event_id":"event_02174067164641200000000000000000000000000000000ef49c2","type":"response.input_audio_translation.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":"会对你的生活有价值","language":"zh","start_ms":4002480,"end_ms":4005360}
Parsed event: {'event_id': 'event_02174067164641200000000000000000000000000000000ef49c2', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '会对你的生活有价值', 'language': 'zh', 'start_ms': 4002480, 'end_ms': 4005360}
Received raw message: {"event_id":"event_02174067164784500000000000000000000000000000000fd0274","type":"response.input_audio_transcription.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":". Thank you very much","language":"en","start_ms":4005360,"end_ms":4007120}
Parsed event: {'event_id': 'event_02174067164784500000000000000000000000000000000fd0274', 'type': 'response.input_audio_transcription.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '. Thank you very much', 'language': 'en', 'start_ms': 4005360, 'end_ms': 4007120}
Received raw message: {"event_id":"event_021740671647845000000000000000000000000000000007eb9bc","type":"response.input_audio_translation.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":"。非常感谢","language":"zh","start_ms":4005360,"end_ms":4007120}
Parsed event: {'event_id': 'event_021740671647845000000000000000000000000000000007eb9bc', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '。非常感谢', 'language': 'zh', 'start_ms': 4005360, 'end_ms': 4007120}
Received raw message: {"event_id":"event_0217406716487810000000000000000000000000000000089124d","type":"response.input_audio_transcription.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":". Bye bye","language":"en","start_ms":4007120,"end_ms":4008560}
Parsed event: {'event_id': 'event_0217406716487810000000000000000000000000000000089124d', 'type': 'response.input_audio_transcription.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '. Bye bye', 'language': 'en', 'start_ms': 4007120, 'end_ms': 4008560}
Received raw message: {"event_id":"event_0217406716487810000000000000000000000000000000027de92","type":"response.input_audio_translation.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":"。再见","language":"zh","start_ms":4007120,"end_ms":4008560}
Parsed event: {'event_id': 'event_0217406716487810000000000000000000000000000000027de92', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '。再见', 'language': 'zh', 'start_ms': 4007120, 'end_ms': 4008560}
Received raw message: {"event_id":"event_021740671649894000000000000000000000000000000002faba7","type":"response.input_audio_transcription.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":".","language":"en","start_ms":4008560,"end_ms":4008560}
Parsed event: {'event_id': 'event_021740671649894000000000000000000000000000000002faba7', 'type': 'response.input_audio_transcription.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '.', 'language': 'en', 'start_ms': 4008560, 'end_ms': 4008560}
Received raw message: {"event_id":"event_021740671649894000000000000000000000000000000001e9823","type":"response.input_audio_translation.delta","response_id":"resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575","delta":"。","language":"zh","start_ms":4008560,"end_ms":4008560}
Parsed event: {'event_id': 'event_021740671649894000000000000000000000000000000001e9823', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_021740667640002d6048d48ac842f8da79fa596a47813674ac575', 'delta': '。', 'language': 'zh', 'start_ms': 4008560, 'end_ms': 4008560}
 

In the streaming output of some real-time transcription services it can indeed happen that the server puts the sentence-final punctuation (such as ".") at the start of the next delta rather than appending it to the previous one. Concatenation then produces something like:

  • First delta: " will be of of value in in your life"
  • Second delta: ". Thank you very much"

The text then looks as if it has a stray space or line break, or the punctuation simply fails to attach to the previous sentence, giving an unnatural break.


Cause: streaming ASR deltas are not always split on punctuation boundaries

Many real-time ASR and translation engines have an incremental-correction mechanism: they may first emit a sentence without punctuation and only supply the mark in a later delta. This is not unusual, since a streaming recognizer only knows which mark to add once the utterance ends or a sentence boundary is detected, and the mark may then be returned at the head of the next delta.


Common approaches

1. Simple approach: concatenate as-is, no extra handling

The simplest option is to add no extra logic and let the script accumulate the text in the default way.

  • Pros: the logic is as direct as it gets and never disturbs the original delta order.
  • Cons: the history subtitles easily end up with the inelegant "previous sentence lacks punctuation, next one starts with it" pattern.

If subtitle polish is not a priority, or someone will proofread the output later, simply keeping the deltas in order is perfectly acceptable. The snippet below shows the artifact this produces.
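To make the downside concrete, here is what the naive rule produces when the 2-second timer happens to finalize between the two deltas from the log above (plain Python, no Qt; the history list stands in for the history-subtitle widget):

python
# Naive accumulation: the timer finalizes between the two deltas.
history = []

segment = " will be of of value in in your life"
history.append(segment.strip())      # finalized without its period
segment = ""                         # a new segment begins

segment += ". Thank you very much"   # the next delta opens with the period

print(history[-1])  # 'will be of of value in in your life'  <- no period
print(segment)      # '. Thank you very much'                <- orphaned period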


2. While concatenating deltas, detect leading punctuation and move it to the end of the previous segment

If you want the punctuation to sit tightly against the previous sentence, add a small preprocessing step inside update_subtitle_orig (or update_subtitle_trans):

Sample code (placed in ControlPanelWindow.update_subtitle_orig):

python
def update_subtitle_orig(self, text):
    # The previous segment was finalized, so this delta starts a new paragraph.
    if self.orig_finalized:
        self.current_orig_segment = ""
        self.orig_finalized = False

    # --- new: check whether the delta begins with a sentence-ending mark ---
    # Marks we care about; extend to cover Chinese punctuation if needed,
    # e.g. punctuation_chars = {'.', '!', '?', '。', '!', '?'}
    punctuation_chars = {'.', '!', '?'}

    stripped = text.lstrip()  # peek past the leading space the server sends
    if (stripped and stripped[0] in punctuation_chars
            and self.current_orig_segment
            and self.current_orig_segment[-1].isalnum()):  # previous delta ended mid-sentence
        # e.g. this delta is ". Thank you very much": take the '.' off the
        # front and merge it onto the end of the current segment ...
        punctuation = stripped[0]
        self.current_orig_segment = self.current_orig_segment.rstrip() + punctuation
        # ... then keep the remainder, restoring one leading space so the
        # next word does not glue directly onto the punctuation.
        rest = stripped[1:].lstrip()
        text = (" " + rest) if rest else ""

    # Append the (possibly adjusted) delta to the current segment.
    self.current_orig_segment += text

    # Refresh the real-time subtitle overlay.
    self.subtitle_overlay.setSubtitles(self.current_orig_segment, self.current_trans_segment)

    # Restart the 2 s inactivity timer.
    self.segment_timer.start()

With this in place, when the next delta is ". Thank you very much", the "." is pulled off the front and merged into the previous segment, yielding:

  • End of the previous segment: " will be of of value in in your life."
  • Part of the delta actually appended to the segment: " Thank you very much"

Of course, this logic is deliberately simple and hand-tuned: it only works in specific settings (here, English periods, question marks, and exclamation marks) and only covers the "previous segment lacks punctuation + new delta starts with punctuation" case. More complex punctuation (ellipses ..., semicolons ;, the Chinese enumeration comma 、, and so on) you would have to handle yourself. A standalone version you can test outside Qt follows.
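To sanity-check that branch outside Qt, the same move can be exercised on plain strings; this is a standalone rework of the logic above, with names local to the snippet:

python
PUNCT = {'.', '!', '?'}

def move_leading_punct(segment, delta):
    """Move a sentence-ending mark from the start of delta onto segment."""
    stripped = delta.lstrip()
    if stripped and stripped[0] in PUNCT and segment and segment[-1].isalnum():
        segment = segment.rstrip() + stripped[0]   # attach the mark
        rest = stripped[1:].lstrip()
        delta = (" " + rest) if rest else ""       # keep one separating space
    return segment + delta

merged = move_leading_punct(" will be of of value in in your life",
                            ". Thank you very much")
print(merged)  # ' will be of of value in in your life. Thank you very much'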


3. Apply deeper post-processing to all deltas

If you need more polished subtitles, you can post-process more aggressively, for example:

  1. Periodically scan the accumulated deltas for redundant spaces or punctuation to merge or replace.
  2. Detect repeated in-stream corrections, e.g. the ASR first emits "Hello worl" and later corrects it to "Hello world", and deduplicate the overlap (see the sketch after this list).
  3. Apply linguistic rules: tokenize, merge, re-segment sentences, or run partial NLP analysis.

This, however, is heavier text post-processing territory: the code grows well beyond a few lines of concatenation logic.
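As one illustration of point 2, here is a hedged sketch of overlap deduplication. It assumes corrections re-send a prefix of the text they are fixing, as in the "Hello worl" → "Hello world" example; real streams may correct in other shapes:

python
def merge_with_overlap(old, new):
    """Append new to old, dropping the longest prefix of new that old
    already ends with -- a common shape for streaming ASR corrections."""
    for k in range(min(len(old), len(new)), 0, -1):
        if old.endswith(new[:k]):
            return old + new[k:]
    return old + new

print(merge_with_overlap("Hello worl", "Hello world"))  # -> 'Hello world'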


Summary

  • It is quite common for the server to put a sentence-final mark at the start of the next delta.
  • If you want punctuation to stick to the previous sentence, add a small "detect a leading mark and move it to the previous segment" step in update_subtitle_orig / update_subtitle_trans.
  • More advanced pipelines can apply richer post-processing, at the cost of more rules or NLP analysis.

That makes cases like ". Thank you very much" read naturally, much closer to hand-punctuated text. Pick whichever option fits your requirements and implementation budget.

You said:
Help me analyze this:

plaintext
Parsed event: {'event_id': 'event_02174067258138900000000000000000000000000000000363ef4', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565', 'delta': ',也是一名建筑师,有趣的是', 'language': 'zh', 'start_ms': 245480, 'end_ms': 248040}
Received raw message: {"event_id":"event_02174067258299700000000000000000000000000000000885b7a","type":"response.input_audio_transcription.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":", trained architect","language":"en","start_ms":248040,"end_ms":248840}
Received raw message: {"event_id":"event_02174067258299700000000000000000000000000000000ed01e5","type":"response.input_audio_translation.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":",他是一名受过训练的建筑师","language":"zh","start_ms":248040,"end_ms":248840}

Notice that there is no period after "建筑师" here. Yet this is the history subtitle: " Frederick John Hopman, yeah, with a JAN. He was a Dutchman 弗雷德里克·约翰·霍普曼,对,有一个JAN。他是荷兰人 who was also an architect, interestingly enough, trained architect. ,也是一名建筑师,有趣的是,他是一名受过训练的建筑师。" How did that period get added?
 

In the log you provided, no period ("." / "。") indeed appears at the end of "他是一名受过训练的建筑师" in any delta, yet the final history subtitle contains one. This is commonly caused by a late incremental correction in the stream, or by post-processing when the segment is finalized. Below are the usual causes; see which best matches your situation.


1. The server later sent another delta carrying the "." or "。"

Sometimes the server only "patches in" the period in a subsequent delta.

  • For example, the server may first send the delta ,他是一名受过训练的建筑师 (no period),
  • and a moment later send another delta, ". " or "。", to correct/complete the punctuation.
  • Your log may be truncated, or only the middle deltas were printed, so the final punctuation-carrying delta never appears in it.

If so, when WebSocketThread receives the period-carrying delta, it keeps appending it to the current segment, and the history subtitle ends up with the period.

How to confirm: check a more complete log for a delta such as:

json
{"delta": "."}

或者

json
{"delta": "。"}

appearing later in the stream. The small filter below automates the search.
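If the log is long, a small filter can do the search for you. A sketch, assuming the log lines look exactly like the "Received raw message: {...}" lines shown above (session.log is a made-up file name for wherever the console output was captured):

python
import json

PREFIX = "Received raw message: "
PUNCT_ONLY = {'.', '。', '!', '!', '?', '?'}

def punctuation_only_events(lines):
    """Yield events whose delta is nothing but a punctuation mark."""
    for line in lines:
        if not line.startswith(PREFIX):
            continue
        event = json.loads(line[len(PREFIX):])
        if event.get("delta", "").strip() in PUNCT_ONLY:
            yield event

# 'session.log' is a placeholder, not a file the script actually writes.
with open("session.log", encoding="utf-8") as f:
    for ev in punctuation_only_events(f):
        print(ev["start_ms"], ev["end_ms"], repr(ev["delta"]))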


2. The finalize timer post-processed the segment and auto-appended a period

Some developers run a simple post-processing step before finalize_segment() writes the segment into the history subtitles: if the text does not end in any punctuation, a "." or "。" is appended automatically.

  • The goal is to make the history subtitle look "complete": even if the streaming recognizer supplied no mark, one gets added.
  • If you (or your team) modified finalize_segment() with something like:
    python
    if not current_orig_segment.endswith(('.', '!', '?', '。', '!', '?')):
        current_orig_segment += '。'

    then the history subtitle gains a period regardless of whether the last delta carried one.

How to confirm: check whether ControlPanelWindow.finalize_segment() contains logic like this; a fuller sketch of that pattern follows.
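For reference, a sketch of what such a finalize with auto-punctuation typically looks like; whether your copy actually contains this is exactly what you want to verify (history_edit is an assumed name for the history QTextEdit):

python
SENTENCE_END = ('.', '!', '?', '。', '!', '?')

class ControlPanelWindowSketch:
    def finalize_segment(self):
        seg = self.current_orig_segment.strip()
        if seg:
            # Post-processing: if the stream never sent a final mark,
            # append one so the history line looks complete.
            if not seg.endswith(SENTENCE_END):
                seg += '。'
            self.history_edit.append(seg)  # assumed history QTextEdit
        # Reset state for the next live segment.
        self.current_orig_segment = ""
        self.orig_finalized = True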


3. The server or translation engine performed an internal in-place correction

In some streaming services, earlier deltas are overwritten (or merged) by later ones.

  • For example, the first Chinese translation received is ,他是一名受过训练的建筑师,
  • and once the engine has analyzed more context, it may emit a new "correction" delta ",他是一名受过训练的建筑师。" (with the added period) that replaces the earlier translation.
  • This usually shows up as events whose start_ms and end_ms are identical or very close, but whose delta content has changed.

How to confirm: look for a later event covering the same start_ms ~ end_ms window with a new period-carrying delta. If you find one, the engine corrected its own result. A client-side sketch for honoring such corrections follows.
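If the engine does overwrite, the client must replace rather than append. A minimal sketch keyed on the (start_ms, end_ms) window; that the service re-sends the same window for corrections is the assumption to verify against your logs:

python
pieces = {}  # (start_ms, end_ms) -> latest delta text for that window

def apply_delta(event):
    """Store the newest text for each time window and rebuild the segment,
    so a re-sent window replaces its old text instead of duplicating it."""
    key = (event["start_ms"], event["end_ms"])
    pieces[key] = event["delta"]
    return "".join(pieces[k] for k in sorted(pieces))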


4. Punctuation spliced onto the previous sentence by manual/other logic

If you added manual handling in update_subtitle_trans or update_subtitle_orig, for instance:

  • moving a leading punctuation mark (such as ". ") from a new delta onto the end of the previous sentence, or
  • appending a "。" when the speaker pauses but the previous sentence has no punctuation,

then the final history subtitle will likewise contain one more mark than the deltas did. Such post-processing is very common in subtitle scripts, precisely to make the output read more smoothly.


Summary

Judging from the log you pasted, those deltas indeed carry no period, yet the history subtitle shows a "。". The most common explanation is that a later delta brought the period, or the finalize step auto-appended one. You can:

  1. Check the subsequent log entries for a new delta carrying the period.
  2. Check the code: do finalize_segment(), update_subtitle_trans, and related functions ever append a period themselves?
  3. If the code does not and the log truly has no period-carrying delta, then most likely your streaming translation service corrected the result in place at the end, or merged deltas internally.

The simplest verification is to scan the entire log, including the very last deltas, for "delta": "。" or "delta": ".". In the vast majority of cases you will find exactly that punctuation-patching delta.

You said:
Here is a more complete log; find where it is:

plaintext
Parsed event: {'event_id': 'event_02174067258299700000000000000000000000000000000ed01e5', 'type': 'response.input_audio_translation.delta', 'response_id': 'resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565', 'delta': ',他是一名受过训练的建筑师', 'language': 'zh', 'start_ms': 248040, 'end_ms': 248840}
Received raw message: {"event_id":"event_021740672584325000000000000000000000000000000007cd24c","type":"response.input_audio_transcription.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":".","language":"en","start_ms":248840,"end_ms":249000}
Received raw message: {"event_id":"event_02174067258432500000000000000000000000000000000a625e8","type":"response.input_audio_translation.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":"。","language":"zh","start_ms":248840,"end_ms":249000}
Received raw message: {"event_id":"event_02174067258679200000000000000000000000000000000a8cb59","type":"response.input_audio_transcription.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":" Um, cause I have them, I then went on and got a degree in architecture","language":"en","start_ms":249000,"end_ms":253640}
Received raw message: {"event_id":"event_021740672586792000000000000000000000000000000008827aa","type":"response.input_audio_translation.delta","response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565","delta":"因为我有他们,我后来继续攻读了建筑学学位","language":"zh","start_ms":249000,"end_ms":253640}
 

From the more complete log you pasted, it is clear that the "。" does appear, in this pair of deltas (raw messages and their parsed results below):

plaintext
Received raw message: {"event_id":"event_021740672584325000000000000000000000000000000007cd24c",
"type":"response.input_audio_transcription.delta",
"response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565",
"delta":".",
"language":"en","start_ms":248840,"end_ms":249000}

Parsed event: {'event_id': 'event_021740672584325000000000000000000000000000000007cd24c',
'type': 'response.input_audio_transcription.delta',
'response_id': 'resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565',
'delta': '.',
'language': 'en',
'start_ms': 248840,
'end_ms': 249000}

Received raw message: {"event_id":"event_02174067258432500000000000000000000000000000000a625e8",
"type":"response.input_audio_translation.delta",
"response_id":"resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565",
"delta":"。",
"language":"zh",
"start_ms":248840,
"end_ms":249000}

Parsed event: {'event_id': 'event_02174067258432500000000000000000000000000000000a625e8',
'type': 'response.input_audio_translation.delta',
'response_id': 'resp_0217406723327297709ab0187905af327754557baa6b6cfb6b565',
'delta': '。',
'language': 'zh',
'start_ms': 248840,
'end_ms': 249000}

In other words, in the previous delta the server returned only

plaintext
",他是一名受过训练的建筑师"

with no period at all, and then patched it in with the next delta (start_ms=248840 here):

json
"delta": "。"

That is exactly where the period comes from, and why the final history subtitle ends with one.

Conclusion: a sentence-final period does not necessarily arrive together with the text it closes; it often comes as a separate delta (or is delayed to the next one). Once the script appends it to current_trans_segment, the history subtitle naturally carries the period.
