はじまりの大地

2024-07-15 09:14:04 +09:00
@@ -0,0 +1,28 @@
+import { TranscriptionEngine } from '../transcription-engine.js'
+
+// Descriptor of the engine started with the `whisper` command (PyTorch model format).
+// NOTE(review): the description text reads like the one of another Whisper implementation — confirm the wording.
+const openaiWhisperEngine: TranscriptionEngine = {
+  name: 'openai-whisper',
+  description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+  language: 'python',
+  type: 'binary',
+  command: 'whisper',
+  forgeURL: 'https://github.com/openai/whisper',
+  license: 'MIT',
+  supportedModelFormats: [ 'PyTorch' ],
+  languageDetection: true,
+  version: '20231117'
+}
+
+// Descriptor of the engine started with the `whisper-ctranslate2` command (CTranslate2 model format).
+const whisperCtranslate2Engine: TranscriptionEngine = {
+  name: 'whisper-ctranslate2',
+  description: 'Whisper command line client compatible with original OpenAI client based on CTranslate2.',
+  language: 'python',
+  type: 'binary',
+  command: 'whisper-ctranslate2',
+  forgeURL: 'https://github.com/Softcatala/whisper-ctranslate2',
+  license: 'MIT',
+  supportedModelFormats: [ 'CTranslate2' ],
+  languageDetection: true,
+  version: '0.4.4'
+}
+
+/**
+ * Whisper based transcription engines declared by this package, in declaration order:
+ * `openai-whisper` first, then `whisper-ctranslate2`.
+ */
+export const engines: TranscriptionEngine[] = [
+  openaiWhisperEngine,
+  whisperCtranslate2Engine
+]
@@ -0,0 +1,3 @@
+export * from './transcriber/index.js'
+export * from './engines.js'
+export * from './whisper-builtin-model.js'
@@ -0,0 +1,68 @@
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import assert from 'node:assert'
+import { lstat } from 'node:fs/promises'
+import { TranscribeArgs } from '../../abstract-transcriber.js'
+import { TranscriptFile } from '../../transcript-file.js'
+import { TranscriptionModel } from '../../transcription-model.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import { OpenaiTranscriber } from './openai-transcriber.js'
+
+/**
+ * Transcriber driving the `whisper-ctranslate2` command line client.
+ *
+ * Extends OpenaiTranscriber and reuses its `getExecEnv()`, `getDetectedLanguage()` and
+ * `getTranscriptFilePath()` helpers; the command line, the supported model format and
+ * the installed pip package are specific to this class.
+ */
+export class Ctranslate2Transcriber extends OpenaiTranscriber {
+
+  /**
+   * Run the engine binary on `mediaFilePath` and return the transcript file in the requested `format`.
+   *
+   * The command is asked to write every output format (`--output_format all`) into `transcriptDirectory`.
+   * If `language` is not provided, the returned language is the one given by `getDetectedLanguage()`.
+   *
+   * Defaults: the built-in `tiny` model and a freshly generated run id.
+   *
+   * Rejects when `model.path` is set but is not a directory, or when the command itself rejects.
+   */
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format,
+    transcriptDirectory,
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const exec = this.getExec(this.getExecEnv())
+
+    if (model.path) {
+      // lstat() does not follow symbolic links, so a symlink pointing to a directory fails this check
+      // NOTE(review): confirm that rejecting symlinked model directories is intended
+      const modelPathStats = await lstat(model.path)
+
+      assert(modelPathStats.isDirectory(), 'Model path must be a path to a directory.')
+    }
+
+    // A custom model is passed as a directory, a built-in one by its name
+    const modelArgs = model.path
+      ? [ '--model_directory', model.path ]
+      : [ '--model', model.name ]
+
+    // Without an explicit language the option is omitted from the command line
+    const languageArgs = language
+      ? [ '--language', language ]
+      : []
+
+    const cliArgs = [
+      mediaFilePath,
+      ...modelArgs,
+      '--word_timestamps', 'True',
+      '--vad_filter', 'true',
+      // Better precision with 5s of audio
+      // vad_filter is mainly used to improve language detection: detection relies on the first 30 seconds
+      // of the video, so having no voice in them is problematic
+      '--vad_min_silence_duration_ms', '5000',
+      '--output_format', 'all',
+      '--output_dir', transcriptDirectory,
+      ...languageArgs
+    ]
+
+    this.createRun(runId)
+    this.startRun()
+    await exec`${this.getEngineBinary()} ${cliArgs}`
+    this.stopRun()
+
+    const transcriptLanguage = language || await this.getDetectedLanguage(transcriptDirectory, mediaFilePath)
+    const transcriptPath = this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, format)
+
+    return new TranscriptFile({
+      language: transcriptLanguage,
+      path: transcriptPath,
+      format
+    })
+  }
+
+  /** Only models whose format is `CTranslate2` are supported. */
+  supports (model: TranscriptionModel) {
+    const { format } = model
+
+    return format === 'CTranslate2'
+  }
+
+  /** Install, with pip3, the `whisper-ctranslate2` package pinned to the engine version into `directory`. */
+  async install (directory: string) {
+    const exec = this.getExec()
+    const { version } = this.engine
+
+    await exec`pip3 install -U -t ${directory} whisper-ctranslate2==${version}`
+  }
+}
@@ -0,0 +1,2 @@
+export * from './ctranslate2-transcriber.js'
+export * from './openai-transcriber.js'
@@ -0,0 +1,77 @@
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { readJSON } from 'fs-extra/esm'
+import { parse } from 'node:path'
+import { join, resolve } from 'path'
+import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
+import { TranscriptFile, TranscriptFormat } from '../../transcript-file.js'
+
+/**
+ * Transcriber that shells out to the `openai-whisper` command line client.
+ *
+ * `getExec`, `getEngineBinary`, `assertLanguageDetectionAvailable`, the run helpers
+ * (`createRun`/`startRun`/`stopRun`), `binDirectory` and `engine` are inherited:
+ * they are not defined in this class.
+ */
+export class OpenaiTranscriber extends AbstractTranscriber {
+
+  /**
+   * Run the engine binary on `mediaFilePath` and return the transcript file in the requested `format`.
+   *
+   * The command is asked to write every output format (`--output_format all`) into `transcriptDirectory`,
+   * which is what makes the JSON transcript available for language detection.
+   *
+   * @param mediaFilePath media file given to the command; its base name (without extension) names the transcripts
+   * @param model model to use: its `path` when set, else its `name`; the built-in `tiny` model when omitted
+   * @param language spoken language; when falsy, `--language` is omitted and the language is read from the JSON transcript
+   * @param format transcript format of the returned file
+   * @param transcriptDirectory directory given to `--output_dir`
+   * @param runId identifier given to `createRun()`, generated when omitted
+   * @returns the transcript file descriptor (language, path, format)
+   *
+   * Rejects when the command rejects or when the JSON transcript cannot be read.
+   */
+  async transcribe ({
+    mediaFilePath,
+    model,
+    language,
+    format,
+    transcriptDirectory,
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = this.getExec(this.getExecEnv())
+
+    const languageArgs = language ? [ '--language', language ] : []
+
+    // The previous `model?.path || model.name` still dereferenced an omitted model on the right-hand side.
+    // Fall back to the built-in "tiny" model, the default used by Ctranslate2Transcriber.
+    const modelArg = model?.path || model?.name || 'tiny'
+
+    this.createRun(runId)
+    this.startRun()
+
+    await $$`${this.getEngineBinary()} ${[
+      mediaFilePath,
+      '--word_timestamps',
+      'True',
+      '--model',
+      modelArg,
+      '--output_format',
+      'all',
+      '--output_dir',
+      transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(transcriptDirectory, mediaFilePath),
+      path: this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, format),
+      format
+    })
+  }
+
+  // ---------------------------------------------------------------------------
+
+  /** Return the `language` field of the JSON transcript written for `mediaFilePath`. */
+  protected async getDetectedLanguage (transcriptDirectory: string, mediaFilePath: string) {
+    const { language } = await this.readJsonTranscriptFile(transcriptDirectory, mediaFilePath)
+
+    return language
+  }
+
+  /** Read and parse the `.json` transcript written for `mediaFilePath` in `transcriptDirectory`. */
+  protected async readJsonTranscriptFile (transcriptDirectory: string, mediaFilePath: string) {
+    return readJSON(this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, 'json'), 'utf8')
+  }
+
+  /**
+   * Build the transcript path: `<transcriptDirectory>/<media base name without extension>.<format>`.
+   */
+  protected getTranscriptFilePath (transcriptDirectory: string, mediaFilePath: string, format: TranscriptFormat) {
+    return join(transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+  }
+
+  // ---------------------------------------------------------------------------
+
+  /** Install, with pip3, the `openai-whisper` package pinned to the engine version into `directory`. */
+  async install (directory: string) {
+    const $$ = this.getExec()
+
+    await $$`pip3 install -U -t ${[ directory ]} openai-whisper==${this.engine.version}`
+  }
+
+  /**
+   * Environment for the engine command: `PYTHONPATH` set to the parent of `binDirectory`,
+   * or `undefined` when no `binDirectory` is set.
+   */
+  protected getExecEnv () {
+    if (!this.binDirectory) return undefined
+
+    return { PYTHONPATH: resolve(this.binDirectory, '../') }
+  }
+}
@@ -0,0 +1,11 @@
+import { TranscriptionModel } from '../transcription-model.js'
+
+/** Names accepted by WhisperBuiltinModel. */
+export type WhisperBuiltinModelName =
+  | 'tiny'
+  | 'base'
+  | 'small'
+  | 'medium'
+  | 'large'
+  | 'large-v2'
+  | 'large-v3'
+
+/**
+ * Transcription model identified only by one of the built-in Whisper model names.
+ */
+export class WhisperBuiltinModel extends TranscriptionModel {
+
+  // The constructor only forwards its argument: it exists to restrict `name` to WhisperBuiltinModelName
+  // eslint-disable-next-line @typescript-eslint/no-useless-constructor
+  constructor (name: WhisperBuiltinModelName) {
+    super(name)
+  }
+}