はじまりの大地

2024-07-15 09:14:04 +09:00
@@ -0,0 +1,63 @@
+# Transcription DevTools
+
+Includes:
+  * __JiWER__ CLI Node.js wrapper
+  * Benchmark tool to test multiple transcription engines
+  * TypeScript classes to evaluate the word error rate of transcript files generated by the transcribers
+
+## Build
+
+```sh
+npm run build
+```
+
+## Benchmark
+
+A benchmark of the available __transcribers__ can be run with:
+```sh
+npm run benchmark
+```
+```
+┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
+│        (index)         │          WER          │          CER          │ duration │ model  │        engine         │
+├────────────────────────┼───────────────────────┼───────────────────────┼──────────┼────────┼───────────────────────┤
+│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%'  │  '9.62457337883959%'  │  '41s'   │ 'tiny' │   'openai-whisper'    │
+│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%'  │ '10.46195652173913%'  │  '15s'   │ 'tiny' │ 'whisper-ctranslate2' │
+└────────────────────────┴───────────────────────┴───────────────────────┴──────────┴────────┴───────────────────────┘
+```
+
+The benchmark can be run with several built-in model sizes:
+
+```sh
+MODELS=tiny,small,large npm run benchmark
+```
+
+## JiWER
+
+> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
+> https://jitsi.github.io/jiwer/cli/
+
+__JiWER__ serves as a reference implementation to calculate error rates between two text files:
+- WER (Word Error Rate)
+- CER (Character Error Rate)
+
+
+### Usage
+
+```typescript
+const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
+
+// WER as a ratio, e.g. 0.03 means 3%
+console.log(await jiwerCLI.wer())
+
+// CER as a ratio, e.g. 0.01 means 1%
+console.log(await jiwerCLI.cer())
+
+// Detailed comparison report
+console.log(await jiwerCLI.alignment())
+```
+
+## Resources
+
+- https://jitsi.github.io/jiwer/
+- https://github.com/rapidfuzz/RapidFuzz
@@ -0,0 +1,20 @@
+{
+  "name": "@peertube/peertube-transcription-devtools",
+  "private": true,
+  "version": "0.0.0",
+  "main": "dist/index.js",
+  "files": [ "dist" ],
+  "exports": {
+    "types": "./dist/index.d.ts",
+    "peertube:tsx": "./src/index.ts",
+    "default": "./dist/index.js"
+  },
+  "type": "module",
+  "devDependencies": {},
+  "scripts": {
+    "build": "tsc",
+    "watch": "tsc -w",
+    "benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts"
+  },
+  "dependencies": {}
+}
@@ -0,0 +1 @@
+jiwer
@@ -0,0 +1,142 @@
+import { millisecondsToTime } from '@peertube/peertube-core-utils'
+import { SUUID, buildAbsoluteFixturePath, buildSUUID } from '@peertube/peertube-node-utils'
+import {
+  TranscriptFile,
+  TranscriptionEngine,
+  TranscriptionEngineName,
+  TranscriptionModel,
+  transcriberFactory
+} from '@peertube/peertube-transcription'
+import { ensureDir, remove } from 'fs-extra/esm'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { PerformanceObserver, performance } from 'node:perf_hooks'
+import { createLogger, format, transports } from 'winston'
+import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
+
+/**
+ * One benchmark row, identified by the uuid of the transcription run.
+ * Every field but `uuid` is optional because a row is built from several partial updates
+ * (one carrying the duration, another carrying the evaluation figures).
+ */
+interface BenchmarkResult {
+  // Identifier of the transcription run
+  uuid: SUUID
+  // Word Error Rate as returned by the evaluator (multiplied by 100 for display)
+  WER?: number
+  // Character Error Rate as returned by the evaluator (multiplied by 100 for display)
+  CER?: number
+  // Duration of the performance measure named after the run, formatted with millisecondsToTime for display
+  duration?: number
+  // Engine of the transcriber that produced the transcript
+  engine?: TranscriptionEngine
+  // Name of the model used for the run
+  model?: string
+}
+
+// Benchmark results indexed by the uuid of their run
+type Benchmark = Record<SUUID, BenchmarkResult>
+
+/**
+ * Folds a (possibly partial) result into the benchmark and returns a new benchmark object.
+ * Fields already stored under the same uuid are kept unless the incoming result overrides them;
+ * the benchmark passed in is left untouched.
+ */
+const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => {
+  const { uuid } = benchmarkResult
+  const mergedResult = { ...benchmark[uuid], ...benchmarkResult }
+
+  return { ...benchmark, [uuid]: mergedResult }
+}
+
+/**
+ * Builds a reducer callback (to be used over the uuids of `benchmarkResults`) which groups
+ * the formatted results by model name: `{ [model]: { [uuid]: formattedResult } }`.
+ */
+const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => {
+  const benchmarkResult = benchmarkResults[uuid]
+  const { model } = benchmarkResult
+
+  // Results already collected for this model, completed with the current one
+  const modelResults = {
+    ...benchmarksGroupedByModel[model],
+    [uuid]: formatBenchmarkResult(benchmarkResult)
+  }
+
+  return { ...benchmarksGroupedByModel, [model]: modelResults }
+}
+
+/**
+ * Display version of a benchmark result: every value is a string ready to be printed in a table.
+ */
+interface FormattedBenchmarkResult {
+  // Word Error Rate with a trailing "%"
+  WER?: string
+  // Character Error Rate with a trailing "%"
+  CER?: string
+  // Duration formatted by millisecondsToTime
+  duration?: string
+  // Model name
+  model?: string
+  // Engine name
+  engine?: string
+}
+
+/**
+ * Converts a raw benchmark result into the strings displayed in the benchmark table.
+ *
+ * @param result raw result; any field may be missing since rows are built from partial updates
+ * @returns the rates as percentages (ex: 0.25 -> "25%"), the duration formatted by
+ * `millisecondsToTime`, the model name and the engine name; missing inputs yield `undefined`
+ */
+const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
+  // Compare against undefined instead of relying on truthiness: a rate of 0 (perfect transcript) must be displayed too
+  WER: WER !== undefined ? `${WER * 100}%` : undefined,
+  CER: CER !== undefined ? `${CER * 100}%` : undefined,
+  duration: duration ? millisecondsToTime(duration) : undefined,
+  model,
+  // The engine is optional, a row may not have received it (yet)
+  engine: engine?.name
+})
+
+/**
+ * Benchmark script: transcribes the same fixture video with every transcriber and every requested model,
+ * compares each transcript with the reference transcript (WER and CER) and prints one table per model.
+ */
+void (async () => {
+  const logger = createLogger()
+  logger.add(new transports.Console({ format: format.printf(log => log.message) }))
+
+  const transcribers: TranscriptionEngineName[] = [ 'openai-whisper', 'whisper-ctranslate2' ]
+  // Models come from the comma separated MODELS environment variable (blank entries are dropped), "tiny" by default
+  const models = process.env.MODELS
+    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
+    : [ 'tiny' ]
+
+  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
+  const pipDirectory = join(tmpdir(), 'peertube-transcription', 'pip')
+
+  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
+  const referenceTranscriptFile = new TranscriptFile({
+    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
+    language: 'fr',
+    format: 'txt'
+  })
+
+  let benchmarkResults: Record<string, BenchmarkResult> = {}
+
+  // Setup: create the transcripts directory and record the duration of every performance measure
+  await ensureDir(transcriptDirectory)
+  const performanceObserver = new PerformanceObserver((items) => {
+    items
+      .getEntries()
+      .forEach((entry) => {
+        // The measure name is used as the run uuid
+        // NOTE(review): presumably the transcriber records a measure named after `runId` - confirm in @peertube/peertube-transcription
+        benchmarkResults = benchmarkReducer(benchmarkResults, {
+          uuid: entry.name as SUUID,
+          duration: entry.duration
+        })
+      })
+  })
+  performanceObserver.observe({ type: 'measure' })
+
+  // Benchmark: every transcriber is installed once, then run with every model
+  logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)
+  for (const transcriberName of transcribers) {
+    logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)
+
+    const transcriber = transcriberFactory.createFromEngineName({
+      engineName: transcriberName,
+      logger: createLogger({ transports: [ new transports.Console() ] }),
+      binDirectory: join(pipDirectory, 'bin')
+    })
+
+    await transcriber.install(pipDirectory)
+
+    for (const modelName of models) {
+      logger.info(`Run benchmark with "${modelName}" model:`)
+      const model = new TranscriptionModel(modelName)
+      const uuid = buildSUUID()
+      const transcriptFile = await transcriber.transcribe({
+        mediaFilePath,
+        model,
+        transcriptDirectory,
+        language: 'fr',
+        format: 'txt',
+        runId: uuid
+      })
+      const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
+      // NOTE(review): presumably yields to the event loop so the observer callback can store the duration first - confirm
+      await new Promise(resolve => setTimeout(resolve, 1))
+
+      // Merged with the row the observer may already have created for this uuid
+      benchmarkResults = benchmarkReducer(benchmarkResults, {
+        uuid,
+        engine: transcriber.engine,
+        WER: await evaluator.wer(),
+        CER: await evaluator.cer(),
+        model: model.name
+      })
+    }
+  }
+
+  // Display: one table per model
+  const benchmarkResultsGroupedByModel = Object
+    .keys(benchmarkResults)
+    .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
+  Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
+
+  // Cleanup: remove the generated transcripts and clear the performance marks
+  await remove(transcriptDirectory)
+  performance.clearMarks()
+})()
@@ -0,0 +1,5 @@
+export * from './jiwer-cli.js'
+export * from './levenshtein.js'
+export * from './transcript-file-evaluator-interface.js'
+export * from './transcript-file-evaluator.js'
+export * from './utils.js'
@@ -0,0 +1,69 @@
+import { $ } from 'execa'
+
+/**
+ * Wrapper around the `jiwer` command line tool.
+ *
+ * Every metric is available as a static method taking the reference and hypothesis file paths,
+ * and as an instance method using the paths given to the constructor.
+ * The `jiwer` command is run by name, so it has to be resolvable by the shell environment.
+ */
+export class JiwerClI {
+  referenceFilePath: string
+  hypothesisFilePath: string
+
+  constructor (referenceFilePath: string, hypothesisFilePath: string) {
+    this.referenceFilePath = referenceFilePath
+    this.hypothesisFilePath = hypothesisFilePath
+  }
+
+  /**
+   * Builds the argument list given to the `jiwer` command.
+   *
+   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
+   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
+   * @param args Additional arguments appended as is.
+   */
+  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: string[]) {
+    return [
+      '--reference',
+      referenceFilePath,
+      '--hypothesis',
+      hypothesisFilePath,
+      ...args
+    ]
+  }
+
+  /**
+   * Same as the static `buildArgs`, using the file paths of this instance.
+   */
+  buildArgs (...args: string[]) {
+    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
+  }
+
+  /**
+   * Arguments enabling the global mode: `-g` when requested, nothing otherwise.
+   * Using `global && '-g'` would push the boolean `false` into the argument list when the mode is disabled.
+   */
+  private static buildGlobalArgs (global: boolean): string[] {
+    return global ? [ '-g' ] : []
+  }
+
+  /**
+   * WER: Word Error Rate as a ratio, ex: 0.03 -> 3%
+   *
+   * @param global pass the `-g` flag to jiwer (see the jiwer CLI documentation)
+   * @returns the command output converted to a number
+   */
+  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, ...JiwerClI.buildGlobalArgs(global))
+    const { stdout: wer } = await $`jiwer ${args}`
+
+    return Number(wer)
+  }
+
+  async wer (global = true) {
+    // The reference comes first: the error rates are not symmetric
+    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * CER: Character Error Rate as a ratio, ex: 0.01 -> 1%
+   *
+   * @param global pass the `-g` flag to jiwer (see the jiwer CLI documentation)
+   * @returns the command output converted to a number
+   */
+  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', ...JiwerClI.buildGlobalArgs(global))
+    const { stdout: cer } = await $`jiwer ${args}`
+
+    return Number(cer)
+  }
+
+  async cer (global = true) {
+    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * Print alignment of each sentence.
+   *
+   * @param global pass the `-g` flag to jiwer (see the jiwer CLI documentation)
+   * @returns the raw command output
+   */
+  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
+    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', ...JiwerClI.buildGlobalArgs(global))
+    const { stdout: alignment } = await $`jiwer ${args}`
+
+    return alignment
+  }
+
+  async alignment (global = true) {
+    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+}
@@ -0,0 +1,101 @@
+/**
+ * Cell update used by `levenshteinDistance`.
+ * When `d0` or `d2` is lower than `d1`, returns the smaller of `d0` and `d2` plus one;
+ * otherwise returns `d1`, plus one when the two char codes `bx` and `ay` differ.
+ */
+function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
+  if (d0 < d1 || d2 < d1) {
+    const lowest = d0 > d2 ? d2 : d0
+
+    return lowest + 1
+  }
+
+  if (bx === ay) return d1
+
+  return d1 + 1
+}
+
+/**
+ * Computes the Levenshtein distance between two strings, comparing them by UTF-16 code unit.
+ *
+ * The strings are swapped if needed so that `a` is the shortest one, then their common suffix
+ * and common prefix are skipped. The remaining part of `b` is consumed four characters per pass
+ * by the first loop, the second loop handles the characters left one at a time.
+ *
+ * @param a first string
+ * @param b second string
+ * @returns the distance between `a` and `b`, 0 when both strings are strictly equal
+ * @see https://github.com/gustf/js-levenshtein
+ */
+export function levenshteinDistance (a: string, b: string): number {
+  if (a === b) {
+    return 0
+  }
+
+  if (a.length > b.length) { // make sure `a` is the shortest string
+    const tmp = a
+    a = b
+    b = tmp
+  }
+
+  let la = a.length
+  let lb = b.length
+
+  while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) { // skip the common suffix
+    la--
+    lb--
+  }
+
+  let offset = 0
+
+  while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) { // skip the common prefix
+    offset++
+  }
+
+  la -= offset
+  lb -= offset
+
+  if (la === 0 || lb < 3) { // nothing left of `a`, or less than 3 characters left of `b`: the remaining length of `b` is the result
+    return lb
+  }
+
+  let x = 0
+  let y: number
+  let d0: number
+  let d1: number
+  let d2: number
+  let d3: number
+  let dd: number
+  let dy: number
+  let ay: number
+  let bx0: number
+  let bx1: number
+  let bx2: number
+  let bx3: number
+
+  const vector: number[] = [] // interleaved pairs: a distance at even indexes, the matching char code of `a` at odd indexes
+
+  for (y = 0; y < la; y++) {
+    vector.push(y + 1)
+    vector.push(a.charCodeAt(offset + y))
+  }
+
+  const len = vector.length - 1
+
+  for (; x < lb - 3;) { // four characters of `b` per pass
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    bx1 = b.charCodeAt(offset + (d1 = x + 1))
+    bx2 = b.charCodeAt(offset + (d2 = x + 2))
+    bx3 = b.charCodeAt(offset + (d3 = x + 3))
+    dd = (x += 4)
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      ay = vector[y + 1]
+      d0 = min(dy, d0, d1, bx0, ay)
+      d1 = min(d0, d1, d2, bx1, ay)
+      d2 = min(d1, d2, d3, bx2, ay)
+      dd = min(d2, d3, dd, bx3, ay)
+      vector[y] = dd
+      d3 = d2
+      d2 = d1
+      d1 = d0
+      d0 = dy
+    }
+  }
+
+  for (; x < lb;) { // remaining characters of `b`, one per pass
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    dd = ++x
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
+      d0 = dy
+    }
+  }
+
+  return dd
+}
@@ -0,0 +1,12 @@
+/**
+ * Complete evaluation of a hypothesis transcript file against a reference transcript file.
+ */
+export interface TranscriptFileEvaluation {
+  // Word Error Rate
+  wer: number
+  // Character Error Rate
+  cer: number
+  // Textual alignment report
+  alignment: string
+}
+
+/**
+ * Contract of a transcript file evaluator: each metric can be computed on its own,
+ * `evaluate` returns all of them at once.
+ */
+export interface TranscriptFileEvaluatorInterface {
+  // Word Error Rate
+  wer(): Promise<number>
+  // Character Error Rate
+  cer(): Promise<number>
+  // Textual alignment report
+  alignment(): Promise<string>
+  // Every metric gathered in a single object
+  evaluate(): Promise<TranscriptFileEvaluation>
+}
@@ -0,0 +1,46 @@
+import assert from 'node:assert'
+import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
+import { TranscriptFile } from '@peertube/peertube-transcription'
+import { JiwerClI } from './jiwer-cli.js'
+
+/**
+ * Evaluates a hypothesis transcript file against a reference transcript file
+ * by delegating every computation to a `JiwerClI` instance.
+ * Both files must use the `txt` format.
+ */
+export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
+  referenceTranscriptFile: TranscriptFile
+  hypothesisTranscriptFile: TranscriptFile
+  jiwerCLI: JiwerClI
+
+  constructor (referenceTranscriptFile: TranscriptFile, hypothesisTranscriptFile: TranscriptFile) {
+    // The reference is checked first, then the hypothesis
+    for (const transcriptFile of [ referenceTranscriptFile, hypothesisTranscriptFile ]) {
+      assert(transcriptFile.format === 'txt', 'Can only evaluate txt transcript file')
+    }
+
+    this.referenceTranscriptFile = referenceTranscriptFile
+    this.hypothesisTranscriptFile = hypothesisTranscriptFile
+
+    this.jiwerCLI = new JiwerClI(referenceTranscriptFile.path, hypothesisTranscriptFile.path)
+  }
+
+  /**
+   * WER: Word Error Rate
+   */
+  wer () {
+    return this.jiwerCLI.wer()
+  }
+
+  /**
+   * CER: Character Error Rate
+   */
+  cer () {
+    return this.jiwerCLI.cer()
+  }
+
+  /**
+   * Alignment report of the hypothesis against the reference
+   */
+  alignment () {
+    return this.jiwerCLI.alignment()
+  }
+
+  /**
+   * Computes the WER, then the CER, then the alignment, and returns them together.
+   */
+  async evaluate () {
+    const wer = await this.wer()
+    const cer = await this.cer()
+    const alignment = await this.alignment()
+
+    return { wer, cer, alignment }
+  }
+}
@@ -0,0 +1,32 @@
+import { join, parse } from 'node:path'
+import { createWriteStream } from 'node:fs'
+import { lstat, unlink } from 'node:fs/promises'
+import assert from 'node:assert'
+import { $ } from 'execa'
+import { makeFileRequest } from '@peertube/peertube-server-commands'
+
+/**
+ * Downloads `url` into `targetDirectory`; the file is named after the last segment of the URL.
+ *
+ * @param url address of the file to download
+ * @param targetDirectory directory the file is written to
+ * @returns the path of the written file, once the write stream has finished
+ *
+ * On a stream error the returned promise rejects with the error message (a string, kept for
+ * backward compatibility) after the write stream is closed and the partial file removed.
+ */
+export const downloadFile = async (url: string, targetDirectory: string) => {
+  const { base } = parse(url)
+  const filePath = join(targetDirectory, base)
+
+  const fileStream = createWriteStream(filePath)
+  const stream = makeFileRequest(url).pipe(fileStream)
+
+  return await new Promise((resolve: (filePath: string) => void, reject) => {
+    stream.on('finish', () => resolve(filePath))
+    stream.on('error', e => {
+      fileStream.close()
+
+      // Removing the partial file is best effort: a failing unlink must neither replace the original error
+      // nor leave this promise pending forever
+      void unlink(filePath)
+        .catch(() => undefined)
+        .finally(() => reject(e.message))
+    })
+  })
+}
+
+/**
+ * Extracts a zip archive next to itself with the `unzip` command, overwriting existing files.
+ *
+ * @param zipFilePath path of the archive, which must be a file
+ * @returns the archive path without its extension
+ */
+export const unzip = async (zipFilePath: string) => {
+  const stats = await lstat(zipFilePath)
+  assert(stats.isFile(), `${zipFilePath} isn't a file.`)
+
+  const parsedPath = parse(zipFilePath)
+
+  await $`unzip -o ${zipFilePath} -d ${parsedPath.dir}`
+
+  return join(parsedPath.dir, parsedPath.name)
+}
@@ -0,0 +1,11 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "./dist",
+    "rootDir": "src",
+    "tsBuildInfoFile": "./dist/.tsbuildinfo",
+  },
+  "references": [
+    { "path": "../transcription" }
+  ]
+}