ReazonSpeechをPySide6から使う

はじめに

ReazonSpeechを使うためにC#でGUIを作ったり、Gradioを使ったりしました。
touch-sp.hatenablog.com
touch-sp.hatenablog.com
今回はPySide6を使ってみました。

PC環境

Windows 11
Python 3.11
CUDA 11.8

Python環境構築

pip install torch==2.5.0+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install cython
git clone https://github.com/reazon-research/ReazonSpeech
pip install ReazonSpeech/pkg/nemo-asr
pip install pyside6

Pythonスクリプト

import sys
from pathlib import Path
from datetime import datetime
from PySide6.QtWidgets import QApplication, QMainWindow, QPushButton, QVBoxLayout, QWidget, QTextEdit
from PySide6.QtCore import QUrl
from PySide6.QtMultimedia import QMediaRecorder, QMediaCaptureSession, QAudioInput, QMediaFormat
from reazonspeech.nemo.asr import load_model, transcribe, audio_from_path

# Load the ReazonSpeech NeMo ASR model once at import time; this single
# module-level instance is what AudioTranscriptionApp.transcribe_audio passes
# to transcribe() for every request.
model = load_model()

class AudioTranscriptionApp(QMainWindow):
    """Main window that records audio to a WAV file and transcribes it.

    The window has a record/stop toggle button, a transcribe button and a
    text area. Each recording is written to a timestamped ``.wav`` file in
    the current working directory; pressing the transcribe button runs the
    module-level ReazonSpeech ``model`` on that file and appends the
    recognised text to the text area.
    """

    def __init__(self):
        """Build the widgets and wire up the Qt Multimedia capture pipeline."""
        super().__init__()
        self.setWindowTitle("音声録音と文字起こし")
        self.setGeometry(100, 100, 400, 300)

        layout = QVBoxLayout()

        self.record_button = QPushButton("録音開始")
        self.record_button.clicked.connect(self.toggle_recording)
        layout.addWidget(self.record_button)

        # Stays disabled until a finished recording is available.
        self.transcribe_button = QPushButton("文字起こし")
        self.transcribe_button.clicked.connect(self.transcribe_audio)
        self.transcribe_button.setEnabled(False)
        layout.addWidget(self.transcribe_button)

        self.text_edit = QTextEdit()
        layout.addWidget(self.text_edit)

        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Set up the media components: audio input -> capture session -> recorder.
        self.audio_input = QAudioInput()
        self.media_recorder = QMediaRecorder()
        self.capture_session = QMediaCaptureSession()
        self.capture_session.setAudioInput(self.audio_input)
        self.capture_session.setRecorder(self.media_recorder)

        # Configure the recording format (WAV container, WAV codec).
        audio_settings = self.media_recorder.mediaFormat()
        audio_settings.setFileFormat(QMediaFormat.FileFormat.Wave)
        audio_settings.setAudioCodec(QMediaFormat.AudioCodec.Wave)
        self.media_recorder.setMediaFormat(audio_settings)

        # Quality setting.
        self.media_recorder.setQuality(QMediaRecorder.Quality.HighQuality)

        self.is_recording = False
        # Path of the most recent recording that has not been transcribed yet.
        self.temp_file = None

        # Connected only after the state attributes above exist, because the
        # handler reads them.
        self.media_recorder.recorderStateChanged.connect(
            self._on_recorder_state_changed
        )

    def toggle_recording(self):
        """Start a recording if idle, otherwise stop the current one."""
        if not self.is_recording:
            self.start_recording()
        else:
            self.stop_recording()

    def start_recording(self):
        """Begin recording into a new ``YYYYmmdd_HHMMSS.wav`` file in the CWD."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.temp_file = Path.cwd() / f"{timestamp}.wav"

        # Pass a plain string so this works regardless of whether the
        # installed PySide6 accepts os.PathLike here.
        self.media_recorder.setOutputLocation(
            QUrl.fromLocalFile(str(self.temp_file))
        )
        self.media_recorder.record()
        self.is_recording = True
        self.record_button.setText("録音停止")
        self.transcribe_button.setEnabled(False)

    def stop_recording(self):
        """Stop recording and enable transcription once the recorder is idle.

        The recorder may still be finalising the output file when ``stop()``
        returns, so the transcribe button is enabled only when the recorder
        reports ``StoppedState`` -- either right away here, or later from the
        ``recorderStateChanged`` handler.
        """
        self.media_recorder.stop()
        self.is_recording = False
        self.record_button.setText("録音開始")
        self._update_transcribe_button()

    def transcribe_audio(self):
        """Transcribe the pending recording and append the text to the view.

        Does nothing when there is no pending recording. Runs synchronously in
        the calling (GUI) slot. After a successful transcription the recording
        is marked as consumed and the transcribe button is disabled again, so
        the button never sits enabled with nothing to transcribe.
        """
        if self.temp_file is None:
            return

        print(self.temp_file)
        audio = audio_from_path(self.temp_file)

        ret = transcribe(model, audio)
        self.text_edit.append(ret.text)

        self.temp_file = None
        self._update_transcribe_button()

    def _on_recorder_state_changed(self, state):
        """Re-evaluate the transcribe button whenever the recorder state changes."""
        self._update_transcribe_button()

    def _update_transcribe_button(self):
        """Enable the transcribe button only for a finished, pending recording."""
        recorder_idle = (
            self.media_recorder.recorderState()
            == QMediaRecorder.RecorderState.StoppedState
        )
        self.transcribe_button.setEnabled(
            recorder_idle
            and not self.is_recording
            and self.temp_file is not None
        )

if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window,
    # and hand control to the Qt event loop.
    app = QApplication(sys.argv)

    window = AudioTranscriptionApp()
    window.show()

    # exec() blocks until the application quits; propagate its return code
    # as the process exit status.
    exit_code = app.exec()
    sys.exit(exit_code)

実行画面






このエントリーをはてなブックマークに追加