はじめに
ReazonSpeechを使うためにC#でGUIを作ったり、Gradioを使ったりしました。
touch-sp.hatenablog.com
touch-sp.hatenablog.com
今回はPySide6を使ってみました。
PC環境
Windows 11
Python 3.11
CUDA 11.8
Python環境構築
pip install torch==2.5.0+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install cython
git clone https://github.com/reazon-research/ReazonSpeech
pip install ReazonSpeech/pkg/nemo-asr
pip install pyside6
Pythonスクリプト
import sys
from datetime import datetime
from pathlib import Path

from PySide6.QtCore import QUrl
from PySide6.QtMultimedia import (
    QAudioInput,
    QMediaCaptureSession,
    QMediaFormat,
    QMediaRecorder,
)
from PySide6.QtWidgets import (
    QApplication,
    QMainWindow,
    QPushButton,
    QTextEdit,
    QVBoxLayout,
    QWidget,
)
from reazonspeech.nemo.asr import audio_from_path, load_model, transcribe

# Load the ASR model once at import time (slow; happens before the GUI shows).
model = load_model()


class AudioTranscriptionApp(QMainWindow):
    """Minimal GUI: record microphone audio to a WAV file, then transcribe
    it with ReazonSpeech and append the text to an editor widget."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle("音声録音と文字起こし")
        self.setGeometry(100, 100, 400, 300)

        layout = QVBoxLayout()

        # Toggles between start/stop of recording (see toggle_recording).
        self.record_button = QPushButton("録音開始")
        self.record_button.clicked.connect(self.toggle_recording)
        layout.addWidget(self.record_button)

        # Disabled until a recording has been completed.
        self.transcribe_button = QPushButton("文字起こし")
        self.transcribe_button.clicked.connect(self.transcribe_audio)
        self.transcribe_button.setEnabled(False)
        layout.addWidget(self.transcribe_button)

        self.text_edit = QTextEdit()
        layout.addWidget(self.text_edit)

        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Media capture pipeline: microphone -> capture session -> recorder.
        self.audio_input = QAudioInput()
        self.media_recorder = QMediaRecorder()
        self.capture_session = QMediaCaptureSession()
        self.capture_session.setAudioInput(self.audio_input)
        self.capture_session.setRecorder(self.media_recorder)

        # Record uncompressed WAV so the ASR front-end can read it directly.
        audio_settings = self.media_recorder.mediaFormat()
        audio_settings.setFileFormat(QMediaFormat.FileFormat.Wave)
        audio_settings.setAudioCodec(QMediaFormat.AudioCodec.Wave)
        self.media_recorder.setMediaFormat(audio_settings)
        self.media_recorder.setQuality(QMediaRecorder.Quality.HighQuality)

        self.is_recording = False
        self.temp_file = None  # Path of the last recording, or None once consumed

    def toggle_recording(self):
        """Start recording when idle; stop it when currently recording."""
        if self.is_recording:
            self.stop_recording()
        else:
            self.start_recording()

    def start_recording(self):
        """Begin recording to a timestamped WAV file in the current directory."""
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.temp_file = Path.cwd() / f"{current_time}.wav"
        # FIX: QUrl.fromLocalFile is declared to take a str; passing a
        # pathlib.Path is not guaranteed to convert across PySide6 versions.
        self.media_recorder.setOutputLocation(QUrl.fromLocalFile(str(self.temp_file)))
        self.media_recorder.record()
        self.is_recording = True
        self.record_button.setText("録音停止")
        self.transcribe_button.setEnabled(False)

    def stop_recording(self):
        """Stop the recorder and allow the captured file to be transcribed."""
        self.media_recorder.stop()
        self.is_recording = False
        self.record_button.setText("録音開始")
        self.transcribe_button.setEnabled(True)

    def transcribe_audio(self):
        """Transcribe the last recording and append the text to the editor."""
        if self.temp_file is None:
            return
        audio = audio_from_path(str(self.temp_file))
        result = transcribe(model, audio)
        self.text_edit.append(result.text)
        self.temp_file = None
        # FIX: the recording has been consumed — disable the button again so
        # a second click is not a silent no-op.
        self.transcribe_button.setEnabled(False)


if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = AudioTranscriptionApp()
    window.show()
    sys.exit(app.exec())