はじまりの大地

2024-07-15 09:14:04 +09:00
@@ -0,0 +1,102 @@
+import { SimpleLogger } from '@peertube/peertube-models'
+import { buildSUUID, SUUID } from '@peertube/peertube-node-utils'
+import { $ } from 'execa'
+import { PerformanceObserver } from 'node:perf_hooks'
+import { join } from 'path'
+import { TranscriptFile, TranscriptFormat } from './transcript-file.js'
+import { TranscriptionEngine } from './transcription-engine.js'
+import { TranscriptionModel } from './transcription-model.js'
+import { TranscriptionRun } from './transcription-run.js'
+
+/**
+ * Options accepted by `AbstractTranscriber.transcribe()`.
+ */
+export interface TranscribeArgs {
+  /** Path of the media file handed to the engine command line */
+  mediaFilePath: string
+
+  /** Directory passed to the engine as its output directory */
+  transcriptDirectory: string
+
+  /** Model used to run the transcription */
+  model: TranscriptionModel
+
+  /** Format of the transcript file returned to the caller */
+  format: TranscriptFormat
+
+  /** Identifier of the transcription run (the transcribers of this package generate one when omitted) */
+  runId?: SUUID
+
+  /** Language of the media (when omitted, the transcribers of this package rely on engine language detection) */
+  language?: string
+}
+
+/**
+ * Common base of the transcribers: stores the engine description, resolves the binary to execute,
+ * builds the `execa` runner and keeps track of the current `TranscriptionRun`.
+ */
+export abstract class AbstractTranscriber {
+  engine: TranscriptionEngine
+
+  protected binDirectory: string
+  protected enginePath: string
+
+  protected logger: SimpleLogger
+
+  protected performanceObserver?: PerformanceObserver
+  protected run?: TranscriptionRun
+
+  constructor (options: {
+    engine: TranscriptionEngine
+    binDirectory?: string
+    enginePath?: string
+
+    logger: SimpleLogger
+    performanceObserver?: PerformanceObserver
+  }) {
+    const { engine, logger, enginePath, binDirectory, performanceObserver } = options
+
+    this.engine = engine
+    this.enginePath = enginePath
+    this.logger = logger
+    this.binDirectory = binDirectory
+    this.performanceObserver = performanceObserver
+  }
+
+  /**
+   * Create (and remember) a new run. Must be called before `startRun()`.
+   */
+  createRun (uuid: SUUID = buildSUUID()) {
+    this.run = new TranscriptionRun(this.logger, uuid)
+  }
+
+  /**
+   * Start the current run.
+   * @throws Error if `createRun()` has not been called first
+   */
+  startRun () {
+    if (!this.run) {
+      throw new Error('Cannot start a transcription run: createRun() must be called first.')
+    }
+
+    this.run.start()
+  }
+
+  /**
+   * Stop the current run and forget it.
+   * @throws Error if there is no current run
+   */
+  stopRun () {
+    if (!this.run) {
+      throw new Error('Cannot stop a transcription run: there is no run in progress.')
+    }
+
+    this.run.stop()
+    delete this.run
+  }
+
+  /**
+   * @throws Error if no language is provided and the engine does not declare language detection
+   */
+  assertLanguageDetectionAvailable (language?: string) {
+    if (!this.engine.languageDetection && !language) {
+      throw new Error(`Language detection isn't available in ${this.engine.name}. A language must be provided explicitly.`)
+    }
+  }
+
+  /**
+   * Whether this transcriber can use the given model (subclasses override this check).
+   */
+  supports (model: TranscriptionModel) {
+    return model.format === 'PyTorch'
+  }
+
+  /**
+   * Resolve the binary to execute: an explicit engine path wins, then the engine command inside
+   * the bin directory, then the bare engine command.
+   */
+  protected getEngineBinary () {
+    if (this.enginePath) return this.enginePath
+    if (this.binDirectory) return join(this.binDirectory, this.engine.command)
+
+    return this.engine.command
+  }
+
+  /**
+   * Build an `execa` runner that forwards its verbose events to the logger.
+   *
+   * @param env environment variables passed to `execa`
+   */
+  protected getExec (env?: { [ id: string ]: string }) {
+    const logLevels = {
+      command: 'debug',
+      output: 'debug',
+      ipc: 'debug',
+      error: 'error',
+      duration: 'debug'
+    }
+
+    return $({
+      verbose: (_verboseLine, { message, ...verboseObject }) => {
+        // Fall back to "debug" so an event type missing from the map above cannot crash the callback
+        const level = logLevels[verboseObject.type] ?? 'debug'
+
+        this.logger[level](message, verboseObject)
+      },
+
+      env
+    })
+  }
+
+  /**
+   * Transcribe a media file and return the resulting transcript file.
+   */
+  abstract transcribe (options: TranscribeArgs): Promise<TranscriptFile>
+
+  /**
+   * Install the engine in the given path.
+   */
+  abstract install (path: string): Promise<void>
+}
@@ -0,0 +1,12 @@
+import { TranscriberFactory } from './transcriber-factory.js'
+import { engines } from './whisper/index.js'
+
+// Public entry point of the package: re-export every building block
+export * from './abstract-transcriber.js'
+export * from './transcript-file.js'
+export * from './subtitle.js'
+export * from './transcription-engine.js'
+export * from './transcription-model.js'
+export * from './transcription-run.js'
+export * from './whisper/index.js'
+
+// Shared factory instance built from the `engines` list imported from ./whisper/index.js
+export const transcriberFactory = new TranscriberFactory(engines)
@@ -0,0 +1 @@
+/**
+ * Remove the cue counters and timestamp lines (plus the blank lines preceding them) from SRT content,
+ * keeping only the subtitle text. Both LF and CRLF line endings are supported.
+ *
+ * A cue header may only start at the beginning of the input or right after a "\n", so a CRLF pair
+ * is never split in two.
+ */
+export const srtToTxt = (srtContent: string) => srtContent.replace(
+  /(?<=^|\n)(?:\r?\n)*\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\r?\n/g,
+  ''
+)
@@ -0,0 +1,47 @@
+import { SimpleLogger } from '@peertube/peertube-models'
+import { TranscriptionEngine, TranscriptionEngineName } from './transcription-engine.js'
+import { Ctranslate2Transcriber, OpenaiTranscriber } from './whisper/index.js'
+
+/**
+ * Builds transcribers from a list of known engines.
+ */
+export class TranscriberFactory {
+  engines: TranscriptionEngine[]
+
+  constructor (engines: TranscriptionEngine[]) {
+    this.engines = engines
+  }
+
+  /**
+   * Instantiate the transcriber matching `engineName`.
+   *
+   * @throws Error if the engine is not part of `engines` or if no transcriber implements it
+   */
+  createFromEngineName (options: {
+    engineName: TranscriptionEngineName
+    enginePath?: string
+    binDirectory?: string
+
+    logger: SimpleLogger
+  }) {
+    const { engineName } = options
+
+    const transcriberArgs = {
+      ...options,
+
+      // Throws when the engine is unknown, before any transcriber is instantiated
+      engine: this.getEngineByName(engineName)
+    }
+
+    switch (engineName) {
+      case 'openai-whisper':
+        return new OpenaiTranscriber(transcriberArgs)
+
+      case 'whisper-ctranslate2':
+        return new Ctranslate2Transcriber(transcriberArgs)
+
+      default:
+        throw new Error(`Unimplemented engine ${engineName}`)
+    }
+  }
+
+  /**
+   * Find an engine description by name.
+   *
+   * @throws Error if no engine has this name
+   */
+  getEngineByName (engineName: string) {
+    const engine = this.engines.find(({ name }) => name === engineName)
+    if (!engine) {
+      throw new Error(`Unknown engine ${engineName}`)
+    }
+
+    return engine
+  }
+}
@@ -0,0 +1,70 @@
+import assert from 'node:assert'
+import { readFile, writeFile } from 'node:fs/promises'
+import { extname } from 'node:path'
+import { srtToTxt } from './subtitle.js'
+
+export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'
+
+/**
+ * A transcript stored on disk, described by its path, language and format.
+ */
+export class TranscriptFile {
+  path: string
+  language: string
+  format: TranscriptFormat = 'vtt'
+
+  constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
+    this.path = path
+    this.language = language
+    this.format = format
+  }
+
+  /**
+   * Asynchronously reads the entire contents of a transcript file.
+   * @see https://nodejs.org/docs/latest-v18.x/api/fs.html#fspromisesreadfilepath-options for options
+   */
+  async read (options: Parameters<typeof readFile>[1] = 'utf8') {
+    return readFile(this.path, options)
+  }
+
+  /**
+   * Build a transcript file description, guessing the format from the file extension.
+   * Only "txt", "vtt" and "srt" can be guessed; the extension is matched case-insensitively.
+   *
+   * @throws AssertionError if the format cannot be guessed from the extension
+   */
+  static fromPath (path: string, language = 'en') {
+    const format = extname(path).substring(1).toLowerCase()
+
+    const guessableFormats = [ 'txt', 'vtt', 'srt' ]
+    assert(
+      guessableFormats.includes(format),
+      `Couldn't guess transcript format from extension "${format}". Valid formats are: ${guessableFormats.join(', ')}.`
+    )
+
+    return new TranscriptFile({ path, language, format: format as TranscriptFormat })
+  }
+
+  /**
+   * Write a transcript file to disk.
+   */
+  static async write ({
+    path,
+    content,
+    language = 'en',
+    format = 'vtt'
+  }: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
+    await writeFile(path, content)
+
+    return new TranscriptFile({ path, language, format })
+  }
+
+  /**
+   * Compare this transcript with another one: languages must match and file contents must be equal.
+   *
+   * @param caseSensitive when false, contents are compared after lower-casing
+   */
+  async equals (transcript: TranscriptFile, caseSensitive: boolean = true) {
+    if (this.language !== transcript.language) {
+      return false
+    }
+
+    // The two reads are independent: run them in parallel
+    const [ content, transcriptContent ] = await Promise.all([ this.read(), transcript.read() ])
+
+    if (!caseSensitive) {
+      return String(content).toLowerCase() === String(transcriptContent).toLowerCase()
+    }
+
+    return content === transcriptContent
+  }
+
+  /**
+   * Read the file and pass its content through `srtToTxt()`, whatever the transcript format is.
+   */
+  async readAsTxt () {
+    return srtToTxt(String(await this.read()))
+  }
+}
@@ -0,0 +1,16 @@
+import { ModelFormat } from './transcription-model.js'
+
+/** Names of the engines a transcriber can be created for */
+export type TranscriptionEngineName =
+  | 'openai-whisper'
+  | 'whisper-ctranslate2'
+
+/**
+ * Static description of a transcription engine.
+ */
+export interface TranscriptionEngine {
+  // Identity
+  name: TranscriptionEngineName
+  version: string
+
+  // How the engine is run
+  type: 'binary'
+  command: string
+
+  // Capabilities
+  supportedModelFormats: ModelFormat[]
+  languageDetection?: true
+
+  // Informative metadata
+  description?: string
+  language?: string
+  license?: string
+  forgeURL?: string
+}
@@ -0,0 +1,34 @@
+import assert from 'node:assert'
+import { stat } from 'node:fs/promises'
+import { parse } from 'node:path'
+
+export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
+
+/**
+ * A transcription model, identified by its name and optionally by a path on disk and a format.
+ */
+export class TranscriptionModel {
+  name: string
+  format?: ModelFormat
+  path?: string
+
+  // NOTE(review): the layout below is not used by this class; presumably a reminder of a binary
+  // model file structure - confirm or remove.
+  // #  - hparams
+  // #  - Number of dimensions (int)
+  // #  - Name length (int)
+  // #  - Dimensions (int[n_dims])
+  // #  - Name (char[name_length])
+  // #  - Data (float[n_dims])
+
+  // #  - mel filters
+  // #  - tokenizer vocab
+  // #  - model variables
+
+  constructor (name: string, path?: string, format?: ModelFormat) {
+    this.name = name
+    this.path = path
+    this.format = format
+  }
+
+  /**
+   * Build a model from a path on disk; the model name is the base name of the path without extension.
+   *
+   * @throws AssertionError if the path doesn't exist (other `stat` errors are rethrown untouched)
+   */
+  static async fromPath (path: string) {
+    try {
+      await stat(path)
+    } catch (err) {
+      // stat() rejects on a missing path instead of resolving to a falsy value,
+      // so the "doesn't exist" message has to be produced here
+      if ((err as { code?: string })?.code === 'ENOENT') {
+        assert.fail(`${path} doesn't exist.`)
+      }
+
+      throw err
+    }
+
+    return new TranscriptionModel(parse(path).name, path)
+  }
+}
@@ -0,0 +1,41 @@
+import { SimpleLogger } from '@peertube/peertube-models'
+import { buildSUUID, SUUID } from '@peertube/peertube-node-utils'
+
+/**
+ * Measures the duration of a transcription using the global `performance` timeline:
+ * a start mark, an end mark and a measure named after the run id.
+ */
+export class TranscriptionRun {
+  uuid: SUUID
+  logger: SimpleLogger
+
+  constructor (logger: SimpleLogger, uuid: SUUID = buildSUUID()) {
+    this.uuid = uuid
+    this.logger = logger
+  }
+
+  get runId () {
+    return this.uuid
+  }
+
+  /**
+   * Record the start mark of the run.
+   */
+  start () {
+    performance.mark(this.getStartPerformanceMarkName())
+  }
+
+  /**
+   * Record the end mark and the measure of the run.
+   * Failures are logged instead of being thrown.
+   */
+  stop () {
+    const startMark = this.getStartPerformanceMarkName()
+    const endMark = this.getEndPerformanceMarkName()
+
+    try {
+      performance.mark(endMark)
+      performance.measure(this.runId, startMark, endMark)
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err)
+
+      this.logger.error(message, { err })
+    } finally {
+      // Marks stay in the global timeline until cleared: drop them once the measure is taken
+      // so they don't pile up run after run (the measure itself is left for its consumers)
+      performance.clearMarks(startMark)
+      performance.clearMarks(endMark)
+    }
+  }
+
+  getStartPerformanceMarkName () {
+    return `${this.runId}-started`
+  }
+
+  getEndPerformanceMarkName () {
+    return `${this.runId}-ended`
+  }
+}
@@ -0,0 +1,28 @@
+import { TranscriptionEngine } from '../transcription-engine.js'
+
+// Descriptions of the Whisper engines shipped with this package.
+// `command` is the binary name resolved by the transcribers and `version` is the one pinned by their `install()`.
+export const engines: TranscriptionEngine[] = [
+  {
+    name: 'openai-whisper',
+    // NOTE(review): this description reads like the tagline of another Whisper implementation - confirm
+    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+    language: 'python',
+    type: 'binary',
+    command: 'whisper',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'PyTorch' ],
+    languageDetection: true,
+    version: '20231117'
+  },
+  {
+    name: 'whisper-ctranslate2',
+    description: 'Whisper command line client compatible with original OpenAI client based on CTranslate2.',
+    language: 'python',
+    type: 'binary',
+    command: 'whisper-ctranslate2',
+    forgeURL: 'https://github.com/Softcatala/whisper-ctranslate2',
+    license: 'MIT',
+    supportedModelFormats: [ 'CTranslate2' ],
+    languageDetection: true,
+    version: '0.4.4'
+  }
+]
@@ -0,0 +1,3 @@
+export * from './transcriber/index.js'
+export * from './engines.js'
+export * from './whisper-builtin-model.js'
@@ -0,0 +1,68 @@
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import assert from 'node:assert'
+import { lstat } from 'node:fs/promises'
+import { TranscribeArgs } from '../../abstract-transcriber.js'
+import { TranscriptFile } from '../../transcript-file.js'
+import { TranscriptionModel } from '../../transcription-model.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import { OpenaiTranscriber } from './openai-transcriber.js'
+
+/**
+ * Transcriber running the `whisper-ctranslate2` command line.
+ */
+export class Ctranslate2Transcriber extends OpenaiTranscriber {
+
+  /**
+   * Run the engine on the media file, writing every output format in `transcriptDirectory`,
+   * and return the transcript file in the requested format.
+   *
+   * @throws Error if no language is given and the engine cannot detect it
+   * @throws AssertionError if `model.path` is set but is not a directory
+   */
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format,
+    transcriptDirectory,
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = this.getExec(this.getExecEnv())
+
+    if (model.path) {
+      const stats = await lstat(model.path)
+
+      assert(stats.isDirectory(), 'Model path must be a path to a directory.')
+    }
+
+    const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+
+    try {
+      await $$`${this.getEngineBinary()} ${[
+        mediaFilePath,
+        ...modelArgs,
+        '--word_timestamps',
+        'True',
+        '--vad_filter',
+        'true',
+        // Better precision with 5s of audio
+        // We mainly use vad_filter to improve language detection (first 30 seconds of the video, so no voice is problematic)
+        '--vad_min_silence_duration_ms',
+        '5000',
+        '--output_format',
+        'all',
+        '--output_dir',
+        transcriptDirectory,
+        ...languageArgs
+      ]}`
+    } finally {
+      // Always close the run, even when the engine process fails
+      this.stopRun()
+    }
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(transcriptDirectory, mediaFilePath),
+      path: this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, format),
+      format
+    })
+  }
+
+  /**
+   * Only models in the CTranslate2 format are supported.
+   */
+  supports (model: TranscriptionModel) {
+    return model.format === 'CTranslate2'
+  }
+
+  /**
+   * Install the pinned engine version in `directory` using pip.
+   */
+  async install (directory: string) {
+    const $$ = this.getExec()
+
+    await $$`pip3 install -U -t ${directory} whisper-ctranslate2==${this.engine.version}`
+  }
+}
@@ -0,0 +1,2 @@
+export * from './ctranslate2-transcriber.js'
+export * from './openai-transcriber.js'
@@ -0,0 +1,77 @@
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { readJSON } from 'fs-extra/esm'
+import { parse } from 'node:path'
+import { join, resolve } from 'path'
+import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
+import { TranscriptFile, TranscriptFormat } from '../../transcript-file.js'
+
+/**
+ * Transcriber running the `whisper` command line.
+ */
+export class OpenaiTranscriber extends AbstractTranscriber {
+
+  /**
+   * Run the engine on the media file, writing every output format in `transcriptDirectory`,
+   * and return the transcript file in the requested format.
+   *
+   * @throws Error if no language is given and the engine cannot detect it
+   */
+  async transcribe ({
+    mediaFilePath,
+    model,
+    language,
+    format,
+    transcriptDirectory,
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = this.getExec(this.getExecEnv())
+
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+
+    try {
+      await $$`${this.getEngineBinary()} ${[
+        mediaFilePath,
+        '--word_timestamps',
+        'True',
+        '--model',
+        // A model stored on disk takes precedence over a model name
+        model.path || model.name,
+        '--output_format',
+        'all',
+        '--output_dir',
+        transcriptDirectory,
+        ...languageArgs
+      ]}`
+    } finally {
+      // Always close the run, even when the engine process fails
+      this.stopRun()
+    }
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(transcriptDirectory, mediaFilePath),
+      path: this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, format),
+      format
+    })
+  }
+
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Read the `language` field of the JSON transcript written by the engine.
+   */
+  protected async getDetectedLanguage (transcriptDirectory: string, mediaFilePath: string) {
+    const { language } = await this.readJsonTranscriptFile(transcriptDirectory, mediaFilePath)
+
+    return language
+  }
+
+  protected async readJsonTranscriptFile (transcriptDirectory: string, mediaFilePath: string) {
+    return readJSON(this.getTranscriptFilePath(transcriptDirectory, mediaFilePath, 'json'), 'utf8')
+  }
+
+  /**
+   * Path of a transcript: the media file base name (without extension) plus the format extension,
+   * inside the transcript directory.
+   */
+  protected getTranscriptFilePath (transcriptDirectory: string, mediaFilePath: string, format: TranscriptFormat) {
+    return join(transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+  }
+
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Install the pinned engine version in `directory` using pip.
+   */
+  async install (directory: string) {
+    const $$ = this.getExec()
+
+    await $$`pip3 install -U -t ${directory} openai-whisper==${this.engine.version}`
+  }
+
+  /**
+   * Environment of the engine process: when a bin directory is configured, `PYTHONPATH` points to
+   * its parent directory (presumably where `install()` put the Python packages - confirm).
+   */
+  protected getExecEnv () {
+    if (!this.binDirectory) return undefined
+
+    return { PYTHONPATH: resolve(this.binDirectory, '../') }
+  }
+}
@@ -0,0 +1,11 @@
+import { TranscriptionModel } from '../transcription-model.js'
+
+// Model names accepted by the `WhisperBuiltinModel` constructor
+export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3'
+
+/**
+ * A transcription model identified by one of the `WhisperBuiltinModelName` names only (no path, no format).
+ */
+export class WhisperBuiltinModel extends TranscriptionModel {
+
+  // The constructor only narrows the accepted names, hence the lint exception
+  // eslint-disable-next-line @typescript-eslint/no-useless-constructor
+  constructor (name: WhisperBuiltinModelName) {
+    super(name)
+  }
+}
				`@@ -0,0 +1 @@`
				`export const srtToTxt = (srtContent: string) => srtContent.replace(/^\n*\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/gm, '')`